
Commit 17e4dc5: update scripts
1 parent de7080b

File tree: 2 files changed, +164 -33 lines

generate/generate.py (2 additions, 2 deletions)

````diff
@@ -114,7 +114,7 @@
 )

 """ Iterate over prompts and generate code """
-if not args.restart and args.cache is not None:
+if not args.restart and args.cache is not None and os.path.exists(args.cache):
     with open(args.cache, 'r') as jsonl_file:
         responses = [json.loads(line) for line in jsonl_file]
         responses = [r for r in responses if r["temperature"] == args.temperature and r["prompted"] == args.prompted
@@ -133,7 +133,7 @@
     prompt_str = cur_prompt["prompt"]

     total_tokens += len(generator.tokenizer.encode(output[0]["generated_text"]))
-    cleaned_output = clean_output(output[0]["generated_text"], prompt_str)
+    cleaned_output = inference_config.clean_output(output[0]["generated_text"], prompt_str)
     cur_prompt["outputs"].append(cleaned_output)

     if idx % args.num_samples_per_prompt == args.num_samples_per_prompt - 1:
````
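The `os.path.exists` guard makes the cache load a no-op when the cache file does not exist yet, instead of crashing on the first run. A minimal sketch of the guarded read, using a hypothetical `args` namespace in place of the script's parsed CLI arguments:

```python
import json
import os
from types import SimpleNamespace

# hypothetical stand-in for the script's parsed CLI arguments
args = SimpleNamespace(restart=False, cache="responses.jsonl")

responses = []
# the added os.path.exists() check skips the read (instead of raising
# FileNotFoundError) when the cache has not been written yet
if not args.restart and args.cache is not None and os.path.exists(args.cache):
    with open(args.cache, "r") as jsonl_file:
        responses = [json.loads(line) for line in jsonl_file]
```

The second hunk routes output cleaning through the model-specific `inference_config.clean_output` rather than the old module-level `clean_output`, which lets chat-style models such as Magicoder strip their own response markup (see `generate/utils.py` below).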

generate/utils.py (162 additions, 31 deletions)
````diff
@@ -1,12 +1,71 @@
 # std imports
 from abc import ABC, abstractmethod
+import re

 # tpl imports
 import torch
 from torch.utils.data import Dataset
 from transformers import StoppingCriteria


+def clean_output(output : str, prompt : str) -> str:
+    """ Remove `prompt` from the beginning of `output`.
+        Also truncate at the end of the function definition (i.e. matching closing brace).
+    """
+    # remove up to the end of the first instance of prompt
+    prompt_loc = output.find(prompt)
+    if prompt_loc == -1:
+        raise ValueError(f"Prompt not found in output: {prompt}")
+    output = output[prompt_loc + len(prompt):].strip()
+
+    # temporarily add opening brace to the beginning
+    output = '{' + output
+
+    # find the matching brace to output[0]
+    stack = []
+    index = 0
+    while index < len(output):
+        token = output[index]
+        if token == '{':
+            stack.append(token)
+        elif token == '}':
+            stack.pop()
+            if len(stack) == 0:
+                break
+
+        index += 1
+
+    # truncate at the matching brace
+    output = output[1:index+1]
+    return output
+
+GPU_FUNCTION_NAME_PATTERN = re.compile(r"__global__ void ([a-zA-Z0-9_]+)\(")
+CPU_FUNCTION_NAME_PATTERN = re.compile(r"\s*[a-zA-Z_]+ ([a-zA-Z0-9_]+)\(")
+def get_function_name(prompt: str, execution_model: str) -> str:
+    if execution_model in ['cuda', 'hip']:
+        match = GPU_FUNCTION_NAME_PATTERN.match(prompt.splitlines()[-1])
+    else:
+        match = CPU_FUNCTION_NAME_PATTERN.match(prompt.splitlines()[-1])
+    if match is None:
+        raise ValueError(f"Could not find function name in prompt: {prompt}")
+    return match.group(1)
+
+
+def find_matching_brace_index(code: str, open_brace_index: int) -> int:
+    """Finds the index of the closing brace that matches the opening brace at the given index."""
+
+    brace_count = 1
+    for i in range(open_brace_index + 1, len(code)):
+        if code[i] == "{":
+            brace_count += 1
+        elif code[i] == "}":
+            brace_count -= 1
+            if brace_count == 0:
+                return i
+
+    raise ValueError("Unmatched opening brace")
+
+
 class InferenceConfig(ABC):

     def __init__(self, prompted : bool = False):
````
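A hypothetical round trip through the new module-level helpers (inputs are illustrative, not from the repo):

```python
# assumes clean_output and get_function_name from generate/utils.py are in scope
prompt = "__global__ void add(int *a, int *b, int *c) {"
output = prompt + "\n    c[0] = a[0] + b[0];\n}\nint main() { return 0; }"

body = clean_output(output, prompt)
# -> "c[0] = a[0] + b[0];\n}"
# the prompt prefix is removed and generation is truncated at the brace
# matching the function's opening brace, dropping the stray main()

name = get_function_name(prompt, "cuda")
# -> "add", captured by GPU_FUNCTION_NAME_PATTERN from the prompt's last line
```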
````diff
@@ -36,6 +95,10 @@ def trust_remote_code(self) -> bool:
     def format_prompt(self, prompt : str) -> str:
         pass

+    @abstractmethod
+    def clean_output(self, output: str, prompt: str) -> str:
+        pass
+

 class StarCoderConfig(InferenceConfig):
````
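Declaring `clean_output` as an `@abstractmethod` makes it part of the `InferenceConfig` contract. A sketch of the effect (class name hypothetical):

```python
# a subclass that omits clean_output can no longer be instantiated
class IncompleteConfig(InferenceConfig):
    def format_prompt(self, prompt: str) -> str:
        return prompt

try:
    IncompleteConfig()
except TypeError as err:
    # the message lists clean_output among the missing abstract methods
    print(err)
```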

````diff
@@ -63,6 +126,9 @@ def format_prompt(self, prompt : str) -> str:
             return f"<filename>solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
         return prompt.strip()

+    def clean_output(self, output: str, prompt: str) -> str:
+        return clean_output(output, prompt)
+
 class CodeLlamaConfig(InferenceConfig):

     def __init__(self, prompted : bool = False):
````
````diff
@@ -90,6 +156,8 @@ def format_prompt(self, prompt : str) -> str:
             return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
         return prompt.strip()

+    def clean_output(self, output: str, prompt: str) -> str:
+        return clean_output(output, prompt)

 class PolyCoderConfig(InferenceConfig):
````
````diff
@@ -116,6 +184,9 @@
         if self.prompted:
             return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
         return prompt.strip()
+
+    def clean_output(self, output: str, prompt: str) -> str:
+        return clean_output(output, prompt)


 class PhindConfig(InferenceConfig):
````
````diff
@@ -144,6 +215,9 @@ def format_prompt(self, prompt : str) -> str:
             return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
         return prompt.strip()

+    def clean_output(self, output: str, prompt: str) -> str:
+        return clean_output(output, prompt)
+

 class ReplitConfig(InferenceConfig):
````
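The five existing configs all satisfy the new contract by delegating to the module-level helper, so their behavior is unchanged; e.g. (illustrative strings):

```python
cfg = StarCoderConfig()
text = "prompt body }"
assert cfg.clean_output(text, "prompt") == clean_output(text, "prompt")
```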

````diff
@@ -174,6 +248,92 @@ def format_prompt(self, prompt : str) -> str:
             return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
         return prompt.strip()

+    def clean_output(self, output: str, prompt: str) -> str:
+        return clean_output(output, prompt)
+
+
+class MagicoderConfig(InferenceConfig):
+
+    PROMPT_TEMPLATE = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+
+@@ Instruction
+{instruction}
+
+@@ Response
+"""
+
+    def __init__(self, prompted : bool = False):
+        super().__init__(prompted=prompted)
+
+    def get_dtype(self):
+        return torch.bfloat16
+
+    def init_padding(self, tokenizer):
+        tokenizer.pad_token_id = tokenizer.eos_token_id   # for batching
+        tokenizer.padding_side = "left"   # for decoder-only models
+
+    def get_pad_token_id(self, tokenizer) -> int:
+        return tokenizer.pad_token_id
+
+    def get_eos_token_id(self, tokenizer) -> int:
+        return tokenizer.eos_token_id
+
+    def trust_remote_code(self) -> bool:
+        return False
+
+    def format_prompt(self, prompt : str) -> str:
+        if self.prompted:
+            function_name = get_function_name(prompt, "cuda" if "__global__" in prompt else "serial")
+            prompt = f"Complete the following c++ function.\n```c++{prompt.strip()}```\nWrite only the function {function_name} and no other code. Enclose your solution in ```c++ and ```."
+            return self.PROMPT_TEMPLATE.format(instruction=prompt)
+        return prompt.strip()
+
+    def clean_output(self, output: str, prompt: str) -> str:
+        """ Clean LLM output to find code solution. The output should be in a ```c++ ``` code block. If there are
+            multiple, then it tries to find the block with the function definition (as contained in the prompt).
+            The code block itself may include the function definition and body OR just the body. This will try
+            to parse both.
+        """
+        # 0. remove everything up to the end of the first "@@ Response" marker
+        prompt_loc = output.find("@@ Response")
+        if prompt_loc == -1:
+            raise ValueError(f"Prompt not found in output: {prompt}")
+        output = output[prompt_loc + len("@@ Response"):].strip()
+
+        # 1. Find all code blocks enclosed in triple backticks with "c++" language tag
+        code_blocks = re.findall(r"```c\+\+\n(.*?)\n```", output, flags=re.DOTALL)
+        code_blocks = [block.lstrip('```c++').rstrip('```') for block in code_blocks]
+
+        # 2. Prioritize code blocks containing the function definition from the prompt
+        sub_prompt = prompt.rstrip().removesuffix("@@ Response").rstrip().removesuffix("```").split("```")[-1]
+        function_name = get_function_name(sub_prompt, "cuda" if "__global__" in sub_prompt else "serial")
+        prioritized_blocks = [block for block in code_blocks if function_name in block]
+
+        # 3. Choose the first block if multiple match, or any block if none match
+        if len(code_blocks) > 0:
+            selected_block = prioritized_blocks[0] if prioritized_blocks else code_blocks[0]
+        else:
+            if '```c++' in output:   # starts with ```c++ but it didn't finish
+                code_idx = output.find('```c++')
+                selected_block = output[code_idx:].removeprefix('```c++')
+            else:
+                selected_block = output
+
+        # 4. Handle cases where the block contains only the function body
+        if function_name not in selected_block:
+            return selected_block
+        else:
+            function_start_index = selected_block.index(function_name)
+            open_brace_index = selected_block.find("{", function_start_index)
+            try:
+                close_brace_index = find_matching_brace_index(selected_block, open_brace_index)
+            except ValueError:
+                close_brace_index = len(selected_block)
+
+            function_body = selected_block[open_brace_index + 1 : close_brace_index]
+            return function_body + "}"

 def get_inference_config(model_name : str, **kwargs) -> InferenceConfig:
     if model_name == "bigcode/starcoderbase":
````
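A hypothetical end-to-end pass through the Magicoder path (the model reply is made up for illustration):

````python
cfg = MagicoderConfig(prompted=True)

raw_prompt = "__global__ void add(int *a, int *b, int *c) {"
formatted = cfg.format_prompt(raw_prompt)  # instruction template ending in "@@ Response\n"

# fabricated model reply: the formatted prompt followed by a fenced solution
reply = formatted + (
    "```c++\n"
    "__global__ void add(int *a, int *b, int *c) {\n"
    "    c[0] = a[0] + b[0];\n"
    "}\n"
    "```"
)

print(cfg.clean_output(reply, raw_prompt))
# step 0 drops everything up to "@@ Response", step 1 extracts the fenced
# c++ block, and step 4 trims it to the function body plus a closing "}"
````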
````diff
@@ -186,41 +346,12 @@ def get_inference_config(model_name : str, **kwargs) -> InferenceConfig:
         return PhindConfig(**kwargs)
     elif model_name == 'replit/replit-code-v1_5-3b':
         return ReplitConfig(**kwargs)
+    elif model_name.startswith('ise-uiuc/Magicoder'):
+        return MagicoderConfig(**kwargs)
     else:
         raise ValueError(f"Unknown model name: {model_name}")


-def clean_output(output : str, prompt : str) -> str:
-    """ Remove `prompt` from the beginning of `output`.
-        Also truncate at the end of the function definition (i.e. matching closing brace).
-    """
-    # remove up to the end of the first instance of prompt
-    prompt_loc = output.find(prompt)
-    if prompt_loc == -1:
-        raise ValueError(f"Prompt not found in output: {prompt}")
-    output = output[prompt_loc + len(prompt):].strip()
-
-    # temporarily add opening brace to the beginning
-    output = '{' + output
-
-    # find the matching brace to output[0]
-    stack = []
-    index = 0
-    while index < len(output):
-        token = output[index]
-        if token == '{':
-            stack.append(token)
-        elif token == '}':
-            stack.pop()
-            if len(stack) == 0:
-                break
-
-        index += 1
-
-    # truncate at the matching brace
-    output = output[1:index+1]
-    return output
-
 class PromptDataset(Dataset):
     ''' PyTorch dataset that simply wraps a list of strings. They do not have to have the same length.
     '''
````
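Usage of the extended factory; `ise-uiuc/Magicoder-S-DS-6.7B` is one checkpoint name matching the new `startswith` prefix:

```python
config = get_inference_config("ise-uiuc/Magicoder-S-DS-6.7B", prompted=True)
assert isinstance(config, MagicoderConfig)
```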
