@@ -178,8 +178,8 @@ def main():
         loss_count = 0

         # Save the model after finishing an epoch
-        epoch_str = "{:02d}".format(epoch + 1)
-        output_dir = os.path.join(args.save_model_dir, '{}-{}'.format("epoch", epoch_str))
+        epoch_str = "epoch-{:02d}".format(epoch + 1)
+        output_dir = os.path.join(args.save_model_dir, epoch_str)
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
         # save training args for each epoch (useful when testing/generating)
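Note (reviewer sketch, not part of the patch): the two changed lines above are a rename-style refactor and the resulting checkpoint path is unchanged; the "epoch-" prefix is simply folded into epoch_str, likely so the formatted name can be reused elsewhere. A minimal check, assuming args.save_model_dir is "checkpoints" and epoch is the zero-based counter:

    import os

    epoch = 2                                      # zero-based epoch counter
    # old: "{:02d}" combined via '{}-{}'.format("epoch", ...) -> checkpoints/epoch-03
    # new: "epoch-{:02d}" joined directly                     -> checkpoints/epoch-03
    epoch_str = "epoch-{:02d}".format(epoch + 1)
    output_dir = os.path.join("checkpoints", epoch_str)
    print(output_dir)                              # checkpoints/epoch-03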
@@ -280,7 +280,15 @@ def save_args(save_dir, args):
         os.makedirs(save_dir)
     args_file = os.path.join(save_dir, 'args.json')
     args_dict = vars(args)
-    args_dict['prompt_templates'] = PROMPT_TEMPLATES
+    # save prompt templates used for training & inference
+    prompt_templates = PROMPT_TEMPLATES.copy()
+    if not args.with_inverse_prompt:
+        prompt_templates.pop(TYPE_INVERSE_PROMPT)
+    if args.with_prompt_ensemble:
+        prompt_templates.pop(TYPE_VALUE_BASED_PROMPT)
+    else:
+        prompt_templates.pop(TYPE_PROMPT_ENSEMBLE)
+    args_dict['prompt_templates'] = prompt_templates
     json.dump(args_dict, open(args_file, "w"), indent=2)
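Note (reviewer sketch, not part of the patch): with this change, args.json records only the prompt templates that were actually active for the run (the inverse prompt only when enabled, and either the value-based or the ensemble template, not both). A minimal illustration of reading it back at test/generation time; the helper name load_saved_args and the checkpoint path are hypothetical:

    import json
    import os

    def load_saved_args(save_dir):
        # read the per-epoch args.json written by save_args()
        with open(os.path.join(save_dir, "args.json")) as f:
            return json.load(f)

    saved = load_saved_args("checkpoints/epoch-01")   # hypothetical checkpoint dir
    prompt_templates = saved["prompt_templates"]      # only the templates used in training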
@@ -314,6 +322,36 @@ def train_prompting(args, history, slot_value_pair, prompt_type, tokenizer, mode
     # model outputs
     outputs = model(**encoded_prompt)

+    if prompt_type == TYPE_INVERSE_PROMPT:
+        # value ids
+        value_ids = tokenizer.encode(slot_value_pair[1], return_tensors="pt", add_prefix_space=True)
+        flipped_value_ids = torch.flip(value_ids, dims=[1])
+        flipped_logits = torch.flip(outputs.logits, dims=[1])
+
+        index = 1
+        # iterate through the value ids and compute loss (combined probability)
+        total_prob = None
+        for item in flipped_value_ids[0]:
+            token_logits = flipped_logits[:, index, :]
+            index += 1
+
+            # softmax probabilities
+            probs = torch.nn.functional.softmax(token_logits, dim=-1)
+
+            # this token generation probability
+            token_prob = torch.gather(probs, 1, torch.tensor([[item]], device=device)).squeeze(-1)
+            # multiply the probabilities for each word in belief state value
+            if total_prob is None:
+                total_prob = token_prob
+            else:
+                total_prob *= token_prob
+
+        loss = torch.negative(torch.log(total_prob))
+        # return loss and 'None' for generated values
+        return loss, None
+
+    # loss for slot generation using value-based prompt & the generated slot
+    if prompt_type == TYPE_VALUE_BASED_PROMPT:
         # get last token logits [-2 -> for last but one item]
         logits = outputs.logits[:, -2, :]
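Note (reviewer sketch, not part of the patch): the inverse-prompt branch above takes the negative log of the product of per-token probabilities over the belief-state value. Since -log(prod_t p_t) = sum_t -log p_t, the same loss can be accumulated in log space with log_softmax, which avoids underflow when the value spans many tokens. A minimal sketch that mirrors the indexing used above; the function name and start_index are my own:

    import torch
    import torch.nn.functional as F

    def inverse_prompt_loss(flipped_logits, flipped_value_ids, start_index=1):
        # flipped_logits: [1, seq_len, vocab_size], flipped along the sequence dimension
        # flipped_value_ids: [1, num_value_tokens], flipped the same way
        log_probs = F.log_softmax(flipped_logits, dim=-1)
        total_log_prob = flipped_logits.new_zeros(())
        for i, token_id in enumerate(flipped_value_ids[0]):
            # log-probability of the i-th value token at position start_index + i
            total_log_prob = total_log_prob + log_probs[0, start_index + i, token_id]
        return -total_log_prob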
@@ -325,9 +363,6 @@ def train_prompting(args, history, slot_value_pair, prompt_type, tokenizer, mode
         loss = torch.negative(torch.log(last_token_prob))

         # generated word -> the one with the highest probability
-        generated_word = None
-        if prompt_type == TYPE_VALUE_BASED_PROMPT:
-            # find the token with the highest probability, this will be the generated word
         gen_word_token = torch.argmax(logits, dim=-1)
         generated_word = tokenizer.decode(gen_word_token, skip_special_tokens=True).strip()