Created data for prompt learning from SOLOIST baseline data

main
Pavan Mandava 3 years ago
parent bd815e6a71
commit 5be3215279

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,90 @@
import json
import os
from pathlib import Path
BELIEF_PREFIX = 'belief :'
def create_belief_states_data_for_prompt_learning(data_tuple):
print('creating belief states data for :: ', data_tuple[1], ' => ', data_tuple[2])
# Assertion check for file availability
assert os.path.isfile(data_tuple[0])
data = json.load(open(data_tuple[0]))
print('Opening file => ', data_tuple[0], ' [Size = ', len(data), ']')
if len(data) <= 0:
return
# data to be saved for prompt learning
belief_states_dataset = []
for item in data:
# map to be added to the list for saving
belief_states_data_item = {}
# add the history & domains of dialog to the data item
belief_states_data_item['history'] = item['history']
belief_states_data_item['domains'] = item['domains']
# extract belief states
belief_states = item['belief']
# remove 'belief:' from the beginning
if belief_states.startswith(BELIEF_PREFIX):
belief_states = belief_states[len(BELIEF_PREFIX):]
# belief states can have multiple domains separated by '|'
belief_state_splits = belief_states.split('|')
# contains list of belief state items -> 'slot = value'
belief_slot_value_list = []
for belief_state in belief_state_splits:
if belief_state == '':
continue
if len(belief_state.split()) == 0:
continue
domain = belief_state.split()[0]
if domain == 'none':
continue
# remove domain from belief state
belief_state = ' '.join(belief_state.split()[1:])
# split belief state slot-value pairs
slot_value_list = belief_state.split(';')
for list_item in slot_value_list:
slot_value_list_item = list_item.strip()
if slot_value_list_item == '':
continue
# add the 'slot = value' string to the list
belief_slot_value_list.append(slot_value_list_item)
# add belief states list to data item map (will be saved)
belief_states_data_item['belief_states'] = belief_slot_value_list
# add to the dataset (to be saved!)
belief_states_dataset.append(belief_states_data_item)
# save the dataset file
save_file_path = '../data/prompt-learning/' + data_tuple[2] + '/'
save_file_name = data_tuple[1] + '.soloist.json'
Path(save_file_path).mkdir(parents=True, exist_ok=True)
print('Saving file => ', save_file_path, ' [Size = ', len(belief_states_dataset), ']')
json.dump(belief_states_dataset, open(save_file_path + save_file_name, 'w'), indent=2)
# List contains tuples
# Each tuple has (data filepath, data type, split name)
data_list = [
("../data/baseline/test/test.soloist.json", "test", "test"),
("../data/baseline/valid/valid.soloist.json", "valid", "valid"),
("../data/baseline/50-dpd/train.soloist.json", "train", "50-dpd"),
("../data/baseline/100-dpd/train.soloist.json", "train", "100-dpd"),
("../data/baseline/125-dpd/train.soloist.json", "train", "125-dpd"),
("../data/baseline/250-dpd/train.soloist.json", "train", "250-dpd")
]
for file_tuple in data_list:
create_belief_states_data_for_prompt_learning(data_tuple=file_tuple)
Loading…
Cancel
Save