From 2d8e82ba5114442f352b4a347187ec4b49e65426 Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Sun, 4 Dec 2022 19:32:57 +0100
Subject: [PATCH] Updated README (added Value Extraction)

---
 ANALYSIS.md      |  3 +++
 README.md        | 35 +++++++++++++++++++++++++----------
 utils/corenlp.py |  2 +-
 3 files changed, 29 insertions(+), 11 deletions(-)
 create mode 100644 ANALYSIS.md

diff --git a/ANALYSIS.md b/ANALYSIS.md
new file mode 100644
index 0000000..db39dbb
--- /dev/null
+++ b/ANALYSIS.md
@@ -0,0 +1,3 @@
+# Analysis of results and outputs
+
+// TODO
\ No newline at end of file
diff --git a/README.md b/README.md
index d759c1c..4ebe702 100644
--- a/README.md
+++ b/README.md
@@ -180,15 +180,6 @@ python evaluate.py
 
 ## Prompt Learning Experiments
 
-### Data
-The data for training the prompt learning model is available under [data/prompt-learning](data/prompt-learning) directory.
-`create_dataset.py` ([link](utils/create_dataset.py)) has the scripts for converting/creating the data for training the prompt-based model.
-
-> **Note:**
-> Running `create_dataset.py` can take some time as it needs to download, install and run Stanford CoreNLP `stanza` package. This scripts downloads coreNLP files of size `~1GB` and requires significant amount of RAM and processor capabilities to run efficiently.
->
-> All the data required for training the prompt-based model is already available under the [data](data) directory of this repo.
-
 ### Install the requirements
 After following the environment setup steps in the previous [section](#environment-setup), install the required python modules for prompt model training.
 
@@ -198,6 +189,26 @@ cd prompt-learning
 pip install -r requirements.txt
 ```
 
+### Data
+The data for training the prompt learning model is available under the [data/prompt-learning](data/prompt-learning) directory.
+`create_dataset.py` ([link](utils/create_dataset.py)) contains the scripts for converting/creating the data used to train the prompt-based model.
+
+### Value Extraction
+Value candidates are extracted from the user dialog history and used in the testing/inference phase: the extracted values are fed to the value-based prompt to generate slots at inference time. Stanford CoreNLP (via the `stanza` package) first produces POS tags and named entities, and a set of rules then extracts values from them (see the sketch below):
+ - Adjectives (`JJ`) and adverbs (`RB`) are considered possible values
+   - Example: *expensive*, *moderate*
+ - A preceding negator `not` is taken into account
+   - Example: *not important* (= dont care)
+ - Named entities (place names, times, dates/days, numbers) are extracted
+   - Example: *08:30*, *friday*
+ - A custom set of regex NER rules recognizes additional named entities
+ - Stop words and repeated candidate values are filtered out
+
+> **Note:**
+> Running `create_dataset.py` can take some time, as it needs to download, install, and run Stanford CoreNLP via the `stanza` package. The script downloads `~1GB` of CoreNLP files and requires a significant amount of RAM and processing power to run efficiently.
+>
+> All the data required for training the prompt-based model is already available under the [data](data) directory of this repo, so running this script is not required to reproduce the results.
+
 ### Train the prompt model
 Train a separate model for each data split. Edit the [train_prompting.sh](prompt-learning/train_prompting.sh) file to modify the default hyperparameters for training (learning rate, epochs).
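For concreteness, here is a condensed sketch of the extraction rules described in the added Value Extraction section. The `apply_rules` helper, its pre-tagged `(word, pos, ner)` tuple input, and the toy stop-word set are hypothetical illustrations only; the real rules run over CoreNLP annotations inside `utils/corenlp.py` (see the diff further below).

```python
# Illustrative only: apply_rules and the pre-tagged (word, pos, ner) tuples are
# hypothetical; the actual implementation operates on CoreNLP output in utils/corenlp.py.
VALUES_CONVERT = {'one': '1', 'not important': 'dont care', 'not expensive': 'cheap'}
STOPWORDS = {'not', 'the', 'a'}  # toy stop-word set

def apply_rules(tagged_tokens):
    candidates, prev_word = [], ''
    for word, pos, ner in tagged_tokens:
        if pos in ('JJ', 'RB'):                       # adjectives and adverbs
            if prev_word == 'not' and 'not ' + word in VALUES_CONVERT:
                word = VALUES_CONVERT['not ' + word]  # negator rule
            if word not in STOPWORDS and word not in candidates:
                candidates.append(word)
        elif ner in ('TIME', 'DATE', 'NUMBER'):       # entity-based values
            word = VALUES_CONVERT.get(word, word)     # e.g. 'one' -> '1'
            if ner == 'TIME' and len(word) == 4:
                word = word.zfill(5)                  # '8:30' -> '08:30'
            if word not in STOPWORDS and word not in candidates:
                candidates.append(word)
        prev_word = word
    return candidates

print(apply_rules([('not', 'RB', 'O'), ('expensive', 'JJ', 'O'),
                   ('friday', 'NNP', 'DATE'), ('8:30', 'CD', 'TIME')]))
# -> ['cheap', 'friday', '08:30']
```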
 ```shell
@@ -254,7 +265,7 @@ python evaluate.py -o path/to/outputs/file
 | 125-dpd | 46.49 | 91.86 |
 | 250-dpd | 47.06 | 92.08 |
 
-> **Note:** All the generated output files for the above reported results are available in the repository. Check [outputs/prompt-learning](outputs/prompt-learning) directory to see the output JSON files for each data-split.
+> **Note:** All the generated output files for the above reported results are available in this repository. Check the [outputs/prompt-learning](outputs/prompt-learning) directory to see the output JSON files for each data-split.
 
 
 ## Multi-prompt Learning Experiments
@@ -313,3 +324,7 @@ sh test_prompting.sh -m
 
 > **Note:** All the generated output files for the above reported results are available in this repository. Check [outputs/multi-prompt](outputs/multi-prompt) directory to see the output JSON files for each data-split.
 
+
+## Analysis
+
+Analyses of the results and belief state generations (outputs) can be found [here](ANALYSIS.md).
\ No newline at end of file
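Before the `utils/corenlp.py` diff below, a note on the `~1GB` download mentioned in the README: it comes from stanza's CoreNLP installer. The following is a minimal setup sketch reusing the directory, model, and version values from `utils/corenlp.py`; the context-manager style here is an assumption of convenience and differs from the repo's explicit `start()`/`stop()` calls.

```python
import os
from pathlib import Path

import stanza
from stanza.server import CoreNLPClient

CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'

# one-time setup: fetch CoreNLP (~1GB) plus the English "extra" models
stanza.install_corenlp(dir=CORENLP_DIR)
stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
os.environ["CORENLP_HOME"] = CORENLP_DIR  # tell stanza where CoreNLP lives

# the client launches a CoreNLP server around the block and shuts it down after
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'ner'], be_quiet=True) as client:
    annotation = client.annotate('i need a cheap restaurant on friday at 08:30 .')
    for sent in annotation.sentence:
        for token in sent.token:
            print(token.word, token.pos, token.ner)
```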
diff --git a/utils/corenlp.py b/utils/corenlp.py
index 8d2f92e..f5a5437 100644
--- a/utils/corenlp.py
+++ b/utils/corenlp.py
@@ -1 +1 @@
 import stanza
 import os
 from pathlib import Path
 from stanza.server import CoreNLPClient
 
 STOPWORDS_FILE = "../data/resource/stopwords.txt"
 CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'
 
 # properties for the CoreNLP Server
 ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'regexner']
 
 # install/download the latest version of CoreNLP
 stanza.install_corenlp(dir=CORENLP_DIR)
 
 extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
 if not os.path.isfile(extra_model_jar):
     # download corenlp english models
     stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
 else:
     print('English Extra CoreNLP models available!')
 
 # set environment var of installation location
 os.environ["CORENLP_HOME"] = CORENLP_DIR
 
 VALUES_CONVERT = {
     'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
     'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
     'not important': 'dont care',
-    'not expensive': 'cheap'
+    'not expensive': 'cheap',
+    'not cheap': 'expensive'
 }
 
 # corenlp properties
 properties = {
     'ner.applyFineGrained': False,
     'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
     'ner.model': 'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,'
                  'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz,'
                  'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
     'ner.additional.regexner.mapping': str(Path().absolute()) + '/regexner.rules'
 }
 
 DATE_TIME_NUM = {'TIME', 'DATE', 'NUMBER'}
 
 
 class ValueExtractor:
     def __init__(self):
         # create the CoreNLP client
         self.client = CoreNLPClient(
             annotators=ANNOTATORS,
             properties=properties,
             timeout=300000,
             memory='8G',
             threads=8,
             be_quiet=True
         )
         # load the stopwords.txt file
         self.stopwords = set()
         with open(STOPWORDS_FILE, 'r') as file:
             for line in file:
                 self.stopwords.add(line.strip())
 
     def start(self):
         self.client.start()
 
     def stop(self):
         self.client.stop()
 
     def extract_value_candidates(self, history):
         # format history and prepare text for annotation
         user_utterance = ''
         for utterance in history:
             # consider only user utterances
             if utterance.startswith('user :'):
                 utterance = utterance[len('user :'):]
                 # if sentence doesn't end with punctuation, add .
                 if not utterance.endswith(('.', '?', '!')):
                     utterance = utterance + '.'
                 # append all utterances
                 user_utterance = user_utterance + utterance + ' '
 
         if not user_utterance:
             return []
 
         value_candidates = []
         user_utterance = user_utterance.replace("'", "")
         # use corenlp client and annotate text
         annotation = self.client.annotate(user_utterance)
 
         date_day = ''
         for sent in annotation.sentence:
             prev_word = ''
             for token in sent.token:
-                # TODO :: remove
-                # print("{:12s}\t{:12s}\t{:6s}\t{}".format(token.word, token.lemma, token.pos, token.ner))
                 word = token.word.strip()
                 # extract Adjectives & Adverbs using POS tags
                 # exclude the custom entity types
                 if token.pos in ['JJ', 'RB'] \
                         and token.ner not in ['AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE']:
                     # check if the word ends with 'ly'
                     word = word.removesuffix('ly')
                     if word not in self.stopwords and word not in value_candidates:
                         if prev_word == 'not' and prev_word + ' ' + word in VALUES_CONVERT:
                             word = VALUES_CONVERT[prev_word + ' ' + word]
                         value_candidates.append(word)
                 prev_word = word
 
                 # extract day, time & numbers
                 if token.ner in DATE_TIME_NUM:
                     if word in VALUES_CONVERT:
                         word = VALUES_CONVERT[word]
                     if token.ner == "DATE" and token.pos not in ["NNP", "CD"]:
                         continue
                     if token.ner == "DATE" and token.pos == "NNP":
                         date_day = word
                         continue
                     if word not in self.stopwords:
                         if token.ner == "TIME":
                             if not word[0].isdigit():
                                 continue
                             if len(word) == 4:
                                 word = word.zfill(5)
                         if word in value_candidates:
                             continue
                         value_candidates.append(word)
 
         # add day/date to value candidates
         if date_day:
             value_candidates.append(date_day)
 
         entity_map = {}
         # extract named entities (place, area,...)
         for sent in annotation.sentence:
             for mention in sent.mentions:
-                # TODO :: remove
-                # print("{:30s}\t{}".format(mention.entityMentionText, mention.entityType))
                 entity_text = mention.entityMentionText.strip()
                 if mention.entityType not in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                         and entity_text not in value_candidates:
                     if mention.entityType in ['AREA', 'FOOD_TYPE', 'PLACE_TYPE']:
                         entity_map[mention.entityType] = entity_text
                     else:
                         value_candidates.append(entity_text)
 
         more_values = list(entity_map.values())
-        return value_candidates + more_values
-
-
-# sample test TODO:: remove
-# dialog_history = [
-#     "user : i need a taxi to go to mahal of cambridge",
-#     "system : i can help with that . did you have a specific time in mind ?",
-#     "user : yes, i'd like to leave after 2:30 please.",
-#     "system : where would you like to depart from ?",
-#     "user : i am departing from jesus green outdoor pool.",
-#     "system : your taxi is booked . it is a [taxi_type] .",
-#     "user : what is the contact number for the taxi?",
-#     "system : the contact number is [taxi_phone] . can i help you with anything else today ?",
-#     "user : thanks that's all for today ."
-# ]
-# extractor = ValueExtractor()
-# extractor.start()
-# values = extractor.extract_value_candidates(dialog_history)
-# extractor.stop()
-# print('Extracted Values: ', values)
\ No newline at end of file
+        return value_candidates + more_values
\ No newline at end of file
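The commented-out sample test removed by this patch doubles as the module's usage example. Below is a runnable sketch of it, using dialog turns from that test; the import path is an assumption (run from the `utils/` directory so the relative `STOPWORDS_FILE` path resolves), and importing the module triggers the CoreNLP install shown earlier.

```python
from corenlp import ValueExtractor  # assumes cwd == utils/, so STOPWORDS_FILE resolves

dialog_history = [
    "user : i need a taxi to go to mahal of cambridge",
    "system : i can help with that . did you have a specific time in mind ?",
    "user : yes, i'd like to leave after 2:30 please.",
    "user : i am departing from jesus green outdoor pool.",
]

extractor = ValueExtractor()
extractor.start()   # boots the CoreNLP server
values = extractor.extract_value_candidates(dialog_history)  # only 'user :' turns are mined
extractor.stop()
print('Extracted Values:', values)
```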