diff --git a/baseline/__init__.py b/baseline/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/baseline/decode_baseline.sh b/baseline/decode_baseline.sh new file mode 100644 index 0000000..0ec7425 --- /dev/null +++ b/baseline/decode_baseline.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# temp 0.7 - 1.5 +# top_p 0.2 - 0.8 + +# Check whether the required environment vars are set +if [[ -z "${SAVED_MODELS_BASELINE}" ]]; then + echo "SAVED_MODELS_BASELINE Environment variable not set. Run set_env_var.sh bash script" + return +fi + +# Check whether the MODEL_CHECKPOINT env var set +if [[ -z "${MODEL_CHECKPOINT}" ]]; then + echo "MODEL_CHECKPOINT Environment variable not set. Run \"export MODEL_CHECKPOINT=\"" + return +fi + +if [ ! -d "${SAVED_MODELS_BASELINE}/${MODEL_CHECKPOINT}" ]; then + echo "Directory ${MODEL_CHECKPOINT} doesn't exist! Provide a valid Saved Model Checkpoint dir." + return +fi + +NS=5 +TEMP=1 +TOP_P=0.5 +OUTPUT_DIR=${OUTPUTS_DIR_BASELINE}/${MODEL_CHECKPOINT} +mkdir -p "${OUTPUT_DIR}" + +python soloist_decode.py \ +--model_type=gpt2 \ +--model_name_or_path="${SAVED_MODELS_BASELINE}"/"${MODEL_CHECKPOINT}" \ +--num_samples $NS \ +--input_file=../data/baseline/test/test.soloist.json \ +--top_p $TOP_P \ +--temperature $TEMP \ +--output_file="${OUTPUT_DIR}"/output_test.json \ +--max_turn 15 diff --git a/baseline/evaluate.py b/baseline/evaluate.py new file mode 100644 index 0000000..f453df6 --- /dev/null +++ b/baseline/evaluate.py @@ -0,0 +1,122 @@ +import json + +BELIEF_PREFIX = 'belief :' +INVALID_SLOT_VALUES = ["", "dontcare", "not mentioned", "don't care", "dont care", "do n't care", "none"] + + +def create_slot_value_map_from_belief_prediction(belief_prediction): + # remove 'belief:' from the beginning + if belief_prediction.startswith(BELIEF_PREFIX): + belief_prediction = belief_prediction[len(BELIEF_PREFIX):] + + belief_state_splits = belief_prediction.split('|') + + belief_slot_value_map = {} + for belief_state in belief_state_splits: + if belief_state == '': + continue + if len(belief_state.split()) == 0: + continue + domain = belief_state.split()[0] + if domain == 'none': + continue + + # remove domain from belief state + belief_state = ' '.join(belief_state.split()[1:]) + + # split belief state slot-value pairs + slot_value_pairs = belief_state.split(';') + + for slot_value_pair in slot_value_pairs: + if slot_value_pair.strip() == '': + continue + slot_values = slot_value_pair.split(' = ') + if len(slot_values) != 2: + continue + else: + slot, value = slot_values + slot = slot.strip().lower() + value = value.strip().lower() + if value in INVALID_SLOT_VALUES: + continue + belief_slot_value_map[slot] = value + + return belief_slot_value_map + + +class BaselineDSTEvaluator: + def __init__(self, predictions_output_file, evaluation_file): + """ + create an Evaluator object for evaluating Baseline DST predictions + + Args: + predictions_output_file: path of the predictions output JSON file + evaluation_file: path of the test/valid data file for extracting ground truth data + """ + # load predictions output json file + self.predictions = json.load(open(predictions_output_file)) + # load test/valid data json file + self.eval_data = json.load(open(evaluation_file)) + + # do some length checks here + if len(self.predictions) == 0 or len(self.eval_data) == 0: + raise ValueError('Invalid Data (no items) in prediction or evaluation data file!') + + if len(self.predictions) != len(self.eval_data): + raise ValueError('Length mismatch!') + + def 
parse_prediction_belief_states(self): + + belief_state_list = [] + + for prediction in self.predictions: + last_line = prediction[-1] + last_line = last_line.strip() + + # remove system responses from the predictions + belief_prediction = last_line.split('system :')[0] + + belief_slot_value_map = create_slot_value_map_from_belief_prediction(belief_prediction) + + belief_state_list.append(belief_slot_value_map) + return belief_state_list + + def parse_true_belief_states(self): + + true_belief_state_list = [] + + for item in self.eval_data: + belief_prediction = item['belief'] + + belief_slot_value_map = create_slot_value_map_from_belief_prediction(belief_prediction) + true_belief_state_list.append(belief_slot_value_map) + + return true_belief_state_list + + def compute_joint_goal_accuracy(self, true_states, prediction_states): + print('Computing Joint Goal Accuracy metric...!') + + if len(true_states) != len(prediction_states): + raise ValueError('Length mismatch!') + + correctly_predicted, total_turns = 0, 0 + for truth, prediction in zip(true_states, prediction_states): + total_turns += 1 + + if set(truth.keys()) != set(prediction.keys()): + continue + + for slot in truth: + if truth[slot] != prediction[slot]: + break + correctly_predicted += 1 + + print('Evaluation :: Joint Goal Accuracy = ', (correctly_predicted / total_turns) * 100) + + +evaluator = BaselineDSTEvaluator('../data/outputs/experiment-20220831/125-dpd/checkpoint-90000/output_test.json', + '../data/baseline/test/test.soloist.json') +predicted_belief_states = evaluator.parse_prediction_belief_states() +true_belief_states = evaluator.parse_true_belief_states() +evaluator.compute_joint_goal_accuracy(true_belief_states, predicted_belief_states) + diff --git a/baseline/soloist_decode.py b/baseline/soloist_decode.py new file mode 100644 index 0000000..bb0a295 --- /dev/null +++ b/baseline/soloist_decode.py @@ -0,0 +1,201 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse +import logging +from tqdm import trange +import json + +import torch +import torch.nn.functional as F +import numpy as np + +import sys +sys.path.append('.') +sys.path.append('./transformers') +sys.path.append('./transformers/') + +from transformers import GPT2Config +from transformers import GPT2LMHeadModel, GPT2Tokenizer + + +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) + +MAX_LENGTH = int(150) # Hardcoded max length to avoid infinite loop + +MODEL_CLASSES = { + 'gpt2': (GPT2LMHeadModel, GPT2Tokenizer) +} + + +def set_seed(args): + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size x vocabulary size) + top_k > 0: keep only top k tokens with highest probability (top-k filtering). + top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + +def sample_sequence(model, length, context, token_type_ids, system_token_id, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu'): + context = torch.tensor(context, dtype=torch.long, device=device) + context = context.unsqueeze(0).repeat(num_samples, 1) + + token_type_ids = torch.tensor(token_type_ids, dtype=torch.long, device=device) + token_type_ids = token_type_ids.unsqueeze(0).repeat(num_samples, 1) + system_token_id = torch.tensor(system_token_id, dtype=torch.long, device=device) + system_token_id = system_token_id.unsqueeze(0).repeat(num_samples, 1) + generated = context + with torch.no_grad(): + for _ in range(length): + + inputs = {'input_ids': generated, 'token_type_ids':token_type_ids} + outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states) + next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.) 
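+                # temperature > 1.0 flattens the next-token distribution, < 1.0 sharpens it; a value of 0 leaves the logits unscaled here and triggers the greedy argmax branch below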
+ + # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858) + for i in range(num_samples): + for _ in set(generated[i].tolist()): + next_token_logits[i, _] /= repetition_penalty + + filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + if temperature == 0: # greedy sampling: + next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1) + else: + next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) + generated = torch.cat((generated, next_token), dim=1) + token_type_ids = torch.cat((token_type_ids, system_token_id), dim=1) + return generated + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_type", default='gpt2', type=str, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list") + parser.add_argument("--padding_text", type=str, default="") + parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.") + parser.add_argument("--length", type=int, default=110) + parser.add_argument("--num_samples", type=int, default=1) + parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 0 implies greedy sampling") + parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2") + parser.add_argument("--top_k", type=int, default=0) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") + parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") + parser.add_argument('--stop_token', type=str, default='<|endoftext|>', help="Token at which text generation is stopped") + parser.add_argument('--input_file', type=str, default=None, help="input json file to decoding") + parser.add_argument('--output_file', type=str, default=None, help="save path") + parser.add_argument('--max_turn', type=int, default=15, help="number of turns used as context") + + + args = parser.parse_args() + + # setup CUDA device + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() + + set_seed(args) + + # setup HuggingFace Model + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + model = model_class.from_pretrained(args.model_name_or_path) + model.to(args.device) + model.eval() + + if args.length < 0 and model.config.max_position_embeddings > 0: + args.length = model.config.max_position_embeddings + elif 0 < model.config.max_position_embeddings < args.length: + args.length = model.config.max_position_embeddings # No generation bigger than model size + elif args.length < 0: + args.length = MAX_LENGTH # avoid infinite loop + + logger.info(args) + inputs = json.load(open(args.input_file)) + output_tests = [] + system_token_id = tokenizer.convert_tokens_to_ids(['system']) + user_token_id = tokenizer.convert_tokens_to_ids(['user']) + + for idx in range(len(inputs)): + logger.info(f"PROGRESS: {int(idx/len(inputs)*100)}%") + example = inputs[idx] + history = example['history'] + context = history[-args.max_turn:] + context_ids = [] + token_ids_for_context = [] + for cxt in context: 
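+            # tokenize each context turn; turns containing 'user :' get user token-type ids, all other turns get system token-type ids (same scheme as in training)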
+ ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cxt)) + context_ids += ids + if 'user :' in cxt: + token_ids_for_context += user_token_id * len(ids) + else: + token_ids_for_context += system_token_id * len(ids) + + response = '=>' + response_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response)) + + context_tokens = context_ids + response_id + token_type_ids = token_ids_for_context + system_token_id + + assert( len(context_tokens) == len(token_type_ids)) + + out = sample_sequence( + model=model, + context=context_tokens, + token_type_ids=token_type_ids, + system_token_id=system_token_id, + num_samples=args.num_samples, + length=args.length, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + device=args.device, + ) + out = out[:, len(context_tokens):].tolist() + examples = [] + for o in out: + text = tokenizer.decode(o, clean_up_tokenization_spaces=True) + text = text[: text.find(args.stop_token) if args.stop_token else None] + examples.append(text) + + output_tests.append(examples) + print(output_tests) + json.dump(output_tests, open(args.output_file,'w'), indent=2) + return text + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/baseline/soloist_train.py b/baseline/soloist_train.py new file mode 100644 index 0000000..343c2cf --- /dev/null +++ b/baseline/soloist_train.py @@ -0,0 +1,739 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import glob +import logging +import os +import pickle +import random +import re +import shutil +import time +import json + +import sys +sys.path.append('.') +sys.path.append('./transformers') +sys.path.append('./transformers/') + +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler +from torch.utils.data.distributed import DistributedSampler + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + +from tqdm import tqdm, trange + +from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,GPT2Config, GPT2DoubleHeadsModel, GPT2Tokenizer) + + +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) + + +MODEL_CLASSES = { + 'gpt2': (GPT2Config, GPT2DoubleHeadsModel, GPT2Tokenizer) +} +s = 0 +s_time = time.time() +def log_every_n_interval(interval, msg): + global s, s_time + s += 1 + if s % interval == 0: + if s_time is None: + s_time = time.time() + iter_p_s = 0 + else: + e_time = time.time() + elapse = e_time - s_time + iter_p_s = interval / elapse + s_time = e_time + logger.info(f'MSG: {msg}; ITER: {iter_p_s:.2f}/s') + +class JsonDataset(Dataset): + def __init__(self, tokenizer, args, file_path='train', max_seq=80, max_turn=1, seperator=' & '): + assert os.path.isfile(file_path) + directory, filename = os.path.split(file_path) + cached_features_file = os.path.join(directory, args.output_dir + '_cached_lm' + '_seqlen_' + str(max_seq) + '_' + filename) + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + logger.info("Loading features from cached file %s", cached_features_file) + with open(cached_features_file, 'rb') as handle: + self.examples = pickle.load(handle) + else: + logger.info(f"Creating features from dataset file at {directory}") + + self.examples = [] + self.labels = [] + self.token_ids = [] + 
self.attention_masks = [] + + self.mc_token_ids = [] + self.mc_labels = [] + + system_token_id = tokenizer.convert_tokens_to_ids(['system']) + user_token_id = tokenizer.convert_tokens_to_ids(['user']) + induction_token_id = tokenizer.convert_tokens_to_ids(['=>']) + + examples = json.load(open(file_path)) + + response_pool = [] + belief_pool = [] + idxs = list(range(len(examples))) + for i in examples: + response = i['reply'] + ' '+tokenizer.eos_token + response_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response)) + response_pool.append(response_id) + + belief_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(i['belief'])) + belief_pool.append(belief_id) + + for example in examples: + history = example['history'] + context = history[-max_turn:] + context_ids = [] + token_ids_for_context = [] + for cxt in context: + ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cxt)) + context_ids += ids + if 'user :' in cxt: + token_ids_for_context += user_token_id * len(ids) + else: + token_ids_for_context += system_token_id * len(ids) + + history = ' '.join(history[-max_turn:]) + kb = ' '#example['kb'] + belief = example['belief'] + + belief_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(belief)) + response = example['reply'] + ' '+tokenizer.eos_token + response_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(response)) + + token_id = token_ids_for_context + system_token_id + system_token_id * len(belief_id) + system_token_id * len(response_id) + source = context_ids + induction_token_id + belief_id + response_id + if args.with_LM: + target = source + else: + target = [-1] * len(context_ids) + [-1] * len(induction_token_id) + belief_id + response_id + + + if len(source) < max_seq: + attention_mask = [0] * max_seq + attention_mask[:len(source)] = [1] * len(source) + self.mc_token_ids.append(len(source) - 1) + source += [0] * (max_seq - len(source)) + target += [-1] * (max_seq - len(target)) + token_id += [0] * (max_seq - len(token_id)) + else: + attention_mask = [1] * max_seq + self.mc_token_ids.append(max_seq - 1) + source = source[-max_seq:] + target = target[-max_seq:] + token_id = token_id[-max_seq:] + + self.mc_labels.append(0) + + if not len(source) == len(target) == len(token_id) == len(attention_mask): + import pdb + pdb.set_trace() + + self.examples.append(source) + self.labels.append(target) + self.token_ids.append(token_id) + self.attention_masks.append(attention_mask) + + if args.add_same_belief_response_prediction: + for _ in range(args.num_candidates): + + random_idx = random.choice(idxs) + new_response_id = response_pool[random_idx] + new_belief_id = belief_pool[random_idx] + + source = context_ids + induction_token_id + new_belief_id + new_response_id + token_id = token_ids_for_context + system_token_id + system_token_id * len(new_belief_id) + system_token_id * len(new_response_id) + if len(source) < max_seq: + attention_mask = [0] * max_seq + attention_mask[:len(source)] = [1] * len(source) + self.mc_token_ids.append(len(source)-1) + source += [0] * (max_seq - len(source)) + token_id += [0] * (max_seq - len(token_id)) + else: + attention_mask = [1] * max_seq + self.mc_token_ids.append(max_seq - 1) + source = source[-max_seq:] + target = target[-max_seq:] + token_id = token_id[-max_seq:] + + + self.examples.append(source) + self.labels.append([-1] * len(source)) + self.token_ids.append(token_id) + self.mc_labels.append(1) + self.attention_masks.append(attention_mask) + + if args.add_response_prediction: + for _ in range(args.num_candidates): + 
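+                    # negative candidate: keep the gold belief but pair it with a response sampled from another dialogue, labelled 1 for the multiple-choice head (LM labels stay masked)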
+ random_idx = random.choice(idxs) + new_response_id = response_pool[random_idx] + new_belief_id = belief_pool[random_idx] + + source = context_ids + induction_token_id + belief_id + new_response_id + token_id = token_ids_for_context + system_token_id + system_token_id * len(belief_id) + system_token_id * len(new_response_id) + if len(source) < max_seq: + attention_mask = [0] * max_seq + attention_mask[:len(source)] = [1] * len(source) + self.mc_token_ids.append(len(source)-1) + source += [0] * (max_seq - len(source)) + token_id += [0] * (max_seq - len(token_id)) + else: + attention_mask = [1] * max_seq + self.mc_token_ids.append(max_seq - 1) + source = source[-max_seq:] + target = target[-max_seq:] + token_id = token_id[-max_seq:] + + + self.examples.append(source) + self.labels.append([-1] * len(source)) + self.token_ids.append(token_id) + self.mc_labels.append(1) + self.attention_masks.append(attention_mask) + + if args.add_belief_prediction: + for _ in range(args.num_candidates): + random_idx = random.choice(idxs) + new_response_id = response_pool[random_idx] + new_belief_id = belief_pool[random_idx] + + source = context_ids + induction_token_id + new_belief_id + response_id + token_id = token_ids_for_context + system_token_id + system_token_id * len(new_belief_id) + system_token_id * len(response_id) + if len(source) < max_seq: + attention_mask = [0] * max_seq + attention_mask[:len(source)] = [1] * len(source) + self.mc_token_ids.append(len(source)-1) + source += [0] * (max_seq - len(source)) + token_id += [0] * (max_seq - len(token_id)) + else: + attention_mask = [1] * max_seq + self.mc_token_ids.append(max_seq - 1) + source = source[-max_seq:] + target = target[-max_seq:] + token_id = token_id[-max_seq:] + + + self.examples.append(source) + self.labels.append([-1] * len(source)) + self.token_ids.append(token_id) + self.mc_labels.append(1) + self.attention_masks.append(attention_mask) + def __len__(self): + return len(self.examples) + + def __getitem__(self, item): + return torch.tensor(self.examples[item]), torch.tensor(self.token_ids[item]), torch.tensor(self.labels[item]), torch.tensor(self.attention_masks[item]), torch.tensor(self.mc_labels[item]), torch.tensor(self.mc_token_ids[item]), + +def load_and_cache_examples(args, tokenizer, evaluate=False): + dataset = JsonDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, max_seq=args.max_seq, max_turn=args.max_turn) + return dataset + +def set_seed(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + +def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): + if not args.save_total_limit: + return + if args.save_total_limit <= 0: + return + + # Check if we should delete older checkpoint(s) + glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix))) + if len(glob_checkpoints) <= args.save_total_limit: + return + + ordering_and_checkpoint_path = [] + for path in glob_checkpoints: + if use_mtime: + ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) + else: + regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path) + if regex_match and regex_match.groups(): + ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) + + checkpoints_sorted = sorted(ordering_and_checkpoint_path) + checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] + number_of_checkpoints_to_delete = max(0, 
len(checkpoints_sorted) - args.save_total_limit) + checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] + for checkpoint in checkpoints_to_be_deleted: + logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) + shutil.rmtree(checkpoint) + + +def mask_tokens(inputs, tokenizer, args): + """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + probability_matrix = torch.full(labels.shape, args.mlm_probability) + special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -1 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + +def train(args, train_dataset, model, tokenizer): + """ Train the model """ + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + + args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) + train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + if args.max_steps > 0: + t_total = args.max_steps + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 + else: + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, + {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + if args.fp16: + try: + from apex import amp + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + model.resize_token_embeddings(len(tokenizer)) + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Distributed training (should be after apex fp16 initialization) + 
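+    # DistributedDataParallel gives each process its own replica and averages gradients across ranks on every backward pass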
if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + + # Train! + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Num Epochs = %d", args.num_train_epochs) + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 0 + tr_loss, logging_loss = 0.0, 0.0 + model.zero_grad() + train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + set_seed(args) # Added here for reproducibility (even between python 2 and 3) + for e in train_iterator: + # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) + for step, batch in enumerate(train_dataloader): + # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) + # logger.info(f" PROGRESS: {float(global_step)/t_total*100}%") + log_every_n_interval(500, f" PROGRESS: {int(float(global_step)/t_total*100)}%") + if step % 500 == 0: + logger.info(f" PROGRESS: {int(float(global_step)/t_total*100)}%") + inputs, tokens, labels, masks,mc_labels, mc_token_ids = batch + + inputs = inputs.to(args.device) + tokens = tokens.to(args.device) + labels = labels.to(args.device) + masks = masks.to(args.device) + mc_labels = mc_labels.to(args.device) + mc_token_ids = mc_token_ids.to(args.device) + + model.train() + outputs = model(inputs, lm_labels=labels, mc_labels=mc_labels, mc_token_ids=mc_token_ids, token_type_ids=tokens, attention_mask=masks) + lm_loss = outputs[0] # model outputs are always tuple in transformers (see doc) + mc_loss = outputs[1] + + loss = lm_loss + args.mc_loss_efficient * mc_loss + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + tr_loss += loss.item() + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) + else: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + optimizer.step() + scheduler.step() # Update learning rate schedule + model.zero_grad() + global_step += 1 + + if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: + # Log metrics + if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + results = evaluate(args, model, tokenizer) + for key, value in results.items(): + tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) + tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + logger.info(f" EVALERR: {(tr_loss - logging_loss)/float(args.logging_steps)}") + logging_loss = tr_loss + + if args.local_rank in [-1, 0] and 
args.save_steps > 0 and global_step % args.save_steps == 0: + checkpoint_prefix = 'checkpoint' + # Save model checkpoint + output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + torch.save(args, os.path.join(output_dir, 'training_args.bin')) + logger.info("Saving model checkpoint to %s", output_dir) + + _rotate_checkpoints(args, checkpoint_prefix) + + # if args.max_steps > 0 and global_step > args.max_steps: + # epoch_iterator.close() + # break + if args.max_steps > 0 and global_step > args.max_steps: + train_iterator.close() + break + + if args.local_rank in [-1, 0]: + tb_writer.close() + + return global_step, tr_loss / global_step + + +def evaluate(args, model, tokenizer, prefix=""): + # Loop to handle MNLI double evaluation (matched, mis-matched) + eval_output_dir = args.output_dir + + eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) + + if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: + os.makedirs(eval_output_dir) + + args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + + # multi-gpu evaluate + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Eval! + logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + eval_loss = 0.0 + nb_eval_steps = 0 + model.eval() + + for batch in tqdm(eval_dataloader, desc="Evaluating"): + # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) + + inputs, tokens, labels, masks = batch + # import pdb + # pdb.set_trace() + inputs = inputs.to(args.device) + tokens = tokens.to(args.device) + labels = labels.to(args.device) + masks = masks.to(args.device) + # inputs = inputs.to(args.device) + # labels = labels.to(args.device) + + with torch.no_grad(): + outputs = model(inputs, masked_lm_labels=labels, token_type_ids=tokens) if args.mlm else model(inputs, labels=labels) + lm_loss = outputs[0] + eval_loss += lm_loss.mean().item() + nb_eval_steps += 1 + + eval_loss = eval_loss / nb_eval_steps + perplexity = torch.exp(torch.tensor(eval_loss)) + + result = { + "perplexity": perplexity + } + + output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results {} *****".format(prefix)) + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + return result + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--train_data_file", default=None, type=str, required=True, + help="The input training data file (a text file).") + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.") + + ## Other parameters + parser.add_argument("--eval_data_file", default=None, 
type=str, + help="An optional input evaluation data file to evaluate the perplexity on (a text file).") + + parser.add_argument("--model_type", default="bert", type=str, + help="The model architecture to be fine-tuned.") + parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str, + help="The model checkpoint for weights initialization.") + + parser.add_argument("--mlm", action='store_true', + help="Train with masked-language modeling loss instead of language modeling.") + parser.add_argument("--mlm_probability", type=float, default=0.15, + help="Ratio of tokens to mask for masked language modeling loss") + + parser.add_argument("--config_name", default="", type=str, + help="Optional pretrained config name or path if not the same as model_name_or_path") + parser.add_argument("--tokenizer_name", default="", type=str, + help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") + parser.add_argument("--cache_dir", default="", type=str, + help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)") + parser.add_argument("--block_size", default=80, type=int, + help="Optional input sequence length after tokenization." + "The training dataset will be truncated in block of this size for training." + "Default to the model max input length for single sentence inputs (take into account special tokens).") + parser.add_argument("--do_train", action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--evaluate_during_training", action='store_true', + help="Run evaluation during training at each logging step.") + parser.add_argument("--do_lower_case", action='store_true', + help="Set this flag if you are using an uncased model.") + + parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument('--gradient_accumulation_steps', type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, + help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + parser.add_argument("--num_train_epochs", default=1.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1, type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") + parser.add_argument("--warmup_steps", default=0, type=int, + help="Linear warmup over warmup_steps.") + + parser.add_argument('--logging_steps', type=int, default=10, + help="Log every X updates steps.") + parser.add_argument('--save_steps', type=int, default=5000, + help="Save checkpoint every X updates steps.") + parser.add_argument('--save_total_limit', type=int, default=None, + help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') + parser.add_argument("--eval_all_checkpoints", action='store_true', + help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number") + parser.add_argument("--no_cuda", action='store_true', + help="Avoid using CUDA when available") + parser.add_argument('--overwrite_output_dir', action='store_true', + help="Overwrite the content of the output directory") + parser.add_argument('--overwrite_cache', action='store_true', + help="Overwrite the cached training and evaluation sets") + parser.add_argument('--seed', type=int, default=42, + help="random seed for initialization") + + parser.add_argument('--fp16', action='store_true', + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") + parser.add_argument('--fp16_opt_level', type=str, default='O1', + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html") + parser.add_argument("--local_rank", type=int, default=-1, + help="For distributed training: local_rank") + parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument('--text_chunk', action='store_true', help="") + parser.add_argument('--with_LM', type=bool, default=True, help="") + + parser.add_argument("--max_seq", default=200, type=int, help="") + parser.add_argument("--max_turn", default=1, type=int, help="") + parser.add_argument("--mc_loss_efficient", default=1, type=float, help="") + parser.add_argument("--num_candidates", default=1, type=int, help="") + parser.add_argument("--add_special_action_tokens", default='', type=str) + parser.add_argument("--add_same_belief_response_prediction", action='store_true') + parser.add_argument("--add_response_prediction", action='store_true') + parser.add_argument("--add_belief_prediction", action='store_true') + + args = parser.parse_args() + + if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm: + raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " + "flag (masked language modeling).") + if args.eval_data_file is None and args.do_eval: + raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " + "or remove the --do_eval argument.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) + + # Setup distant debugging if needed + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() + + # Setup CUDA, GPU & distributed training + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() + else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + torch.distributed.init_process_group(backend='nccl') + args.n_gpu = 1 + args.device = device + + # Setup logging + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + + # Set seed + set_seed(args) + + # Load pretrained model and tokenizer + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab + + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + config.num_labels = 2 + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + if args.block_size <= 0: + args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model + args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) + + if args.add_special_action_tokens: + special_tokens = [] + for line in open(args.add_special_action_tokens): + special_tokens.append(line.strip()) + tokenizer.add_tokens(special_tokens) + model.resize_token_embeddings(len(tokenizer)) + model.to(args.device) + + if args.local_rank == 0: + torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab + + logger.info("Training/evaluation parameters %s", args) + + # Training + if args.do_train: + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache + + train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False) + + if args.local_rank == 0: + torch.distributed.barrier() + + global_step, tr_loss = train(args, train_dataset, model, tokenizer) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) + + + # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using 
from_pretrained() + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + # Create output directory if needed + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + os.makedirs(args.output_dir) + + logger.info("Saving model checkpoint to %s", args.output_dir) + # Save a trained model, configuration and tokenizer using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) + tokenizer.save_pretrained(args.output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + + # Load a trained model and vocabulary that you have fine-tuned + model = model_class.from_pretrained(args.output_dir) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + model.to(args.device) + + + # Evaluation + results = {} + if args.do_eval and args.local_rank in [-1, 0]: + checkpoints = [args.output_dir] + if args.eval_all_checkpoints: + checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + + model = model_class.from_pretrained(checkpoint) + model.to(args.device) + result = evaluate(args, model, tokenizer, prefix=prefix) + result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + results.update(result) + + return results + + +if __name__ == "__main__": + main() diff --git a/baseline/train_baseline.sh b/baseline/train_baseline.sh new file mode 100644 index 0000000..b5ce0dd --- /dev/null +++ b/baseline/train_baseline.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# lr 1e-5 to 5e-5 +# mc_loss_efficient 0.1 to 1 + +while getopts d: flag +do + case "${flag}" in + d) data_split=${OPTARG};; + esac +done + +# Check whether the required environment vars are set +if [[ -z "${SAVED_MODELS_BASELINE}" ]] || [[ -z "${PRE_TRAINED_SOLOIST}" ]]; then + echo "Required Environment variables not set. 
First run set_env_var.sh" +fi + +datetime_now=$(date +"%Y%m%d") +experiment_folder=experiment-${datetime_now}/"${data_split}" +python soloist_train.py \ +--output_dir="${SAVED_MODELS_BASELINE}"/"${experiment_folder}" \ +--model_type=gpt2 \ +--model_name_or_path="${PRE_TRAINED_SOLOIST}" \ +--do_train \ +--train_data_file=../data/baseline/"${data_split}"/train.soloist.json \ +--eval_data_file=../data/baseline/valid/valid.soloist.json \ +--add_special_action_tokens=../data/resource/special_tokens.txt \ +--per_gpu_train_batch_size 1 \ +--num_train_epochs 25 \ +--learning_rate 5e-5 \ +--overwrite_cache \ +--save_steps 5000 \ +--max_seq 100 \ +--overwrite_output_dir \ +--max_turn 15 \ +--num_candidates 1 \ +--mc_loss_efficient 0.33 \ +--add_belief_prediction \ No newline at end of file diff --git a/common/__init__.py b/common/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/set_env.sh b/set_env.sh new file mode 100644 index 0000000..40a0dcb --- /dev/null +++ b/set_env.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Path for storing baseline saved models +export SAVED_MODELS_BASELINE=/mount/studenten/projects/mandavsi/baseline/trained_models +# path for storing test outputs +export OUTPUTS_DIR_BASELINE=/mount/studenten/projects/mandavsi/baseline/outputs + +# path of pretrained SOLOIST model +export PRE_TRAINED_SOLOIST=/mount/studenten/projects/mandavsi/soloist-pretrained/gtg_pretrained + +# create dirs if not exist +mkdir -p ${SAVED_MODELS_BASELINE} +mkdir -p ${OUTPUTS_DIR_BASELINE} +mkdir -p ${PRE_TRAINED_SOLOIST} \ No newline at end of file diff --git a/unset_env.sh b/unset_env.sh new file mode 100644 index 0000000..5d2f777 --- /dev/null +++ b/unset_env.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# run this script to unset env variables +unset SAVED_MODELS_BASELINE +unset PRE_TRAINED_SOLOIST +unset OUTPUTS_DIR_BASELINE \ No newline at end of file