From 2d8e82ba5114442f352b4a347187ec4b49e65426 Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Sun, 4 Dec 2022 19:32:57 +0100
Subject: [PATCH] Updated README (added Value Extraction)

---
 ANALYSIS.md      |  3 +++
 README.md        | 35 +++++++++++++++++++++++++----------
 utils/corenlp.py |  2 +-
 3 files changed, 29 insertions(+), 11 deletions(-)
 create mode 100644 ANALYSIS.md

diff --git a/ANALYSIS.md b/ANALYSIS.md
new file mode 100644
index 0000000..db39dbb
--- /dev/null
+++ b/ANALYSIS.md
@@ -0,0 +1,3 @@
+# Analysis of results and outputs
+
+// TODO
\ No newline at end of file
diff --git a/README.md b/README.md
index d759c1c..4ebe702 100644
--- a/README.md
+++ b/README.md
@@ -180,15 +180,6 @@ python evaluate.py
 
 ## Prompt Learning Experiments
 
-### Data
-The data for training the prompt learning model is available under [data/prompt-learning](data/prompt-learning) directory.
-`create_dataset.py` ([link](utils/create_dataset.py)) has the scripts for converting/creating the data for training the prompt-based model.
-
-> **Note:**
-> Running `create_dataset.py` can take some time as it needs to download, install and run Stanford CoreNLP `stanza` package. This scripts downloads coreNLP files of size `~1GB` and requires significant amount of RAM and processor capabilities to run efficiently.
->
-> All the data required for training the prompt-based model is already available under the [data](data) directory of this repo.
-
 ### Install the requirements
 After following the environment setup steps in the previous [section](#environment-setup), install the required python modules for prompt model training.
 
@@ -198,6 +189,26 @@ cd prompt-learning
 pip install -r requirements.txt
 ```
 
+### Data
+The data for training the prompt learning model is available under the [data/prompt-learning](data/prompt-learning) directory.
+`create_dataset.py` ([link](utils/create_dataset.py)) contains the scripts for converting/creating the data used to train the prompt-based model.
+
+### Value Extraction
+Value candidates are extracted from the user dialog history and used in the testing/inference phase: the extracted values are fed to the value-based prompt to generate slots at inference time. Stanford CoreNLP (via the `stanza` package) first produces POS tags and named entities, and a set of rules then extracts values from them (see the sketch below):
+ - Adjectives (`JJ`) and adverbs (`RB`) are considered possible values
+   - Example: *expensive*, *moderate*
+ - A preceding negator `not` is taken into account
+   - Example: *not important* (= dont care)
+ - Named entities (place names, times, dates/days, numbers) are extracted
+   - Example: *08:30*, *friday*
+ - A custom set of regex NER rules recognizes additional named entities
+ - Stop words and repeated candidate values are filtered out
+
+> **Note:**
+> Running `create_dataset.py` can take some time, as it needs to download, install, and run Stanford CoreNLP via the `stanza` package. The script downloads `~1GB` of CoreNLP files and requires a significant amount of RAM and processing power to run efficiently.
+>
+> All the data required for training the prompt-based model is already available under the [data](data) directory of this repo, so running this script is not required to reproduce the results.
+
 ### Train the prompt model
 Train a separate model for each data split. Edit the [train_prompting.sh](prompt-learning/train_prompting.sh) file to modify the default hyperparameters for training (learning rate, epochs).
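For concreteness, here is a condensed sketch of the extraction rules described in the added Value Extraction section. The `apply_rules` helper, its pre-tagged `(word, pos, ner)` tuple input, and the toy stop-word set are hypothetical illustrations only; the real rules run over CoreNLP annotations inside `utils/corenlp.py` (see the diff further below).

```python
# Illustrative only: apply_rules and the pre-tagged (word, pos, ner) tuples are
# hypothetical; the actual implementation operates on CoreNLP output in utils/corenlp.py.
VALUES_CONVERT = {'one': '1', 'not important': 'dont care', 'not expensive': 'cheap'}
STOPWORDS = {'not', 'the', 'a'}  # toy stop-word set

def apply_rules(tagged_tokens):
    candidates, prev_word = [], ''
    for word, pos, ner in tagged_tokens:
        if pos in ('JJ', 'RB'):                       # adjectives and adverbs
            if prev_word == 'not' and 'not ' + word in VALUES_CONVERT:
                word = VALUES_CONVERT['not ' + word]  # negator rule
            if word not in STOPWORDS and word not in candidates:
                candidates.append(word)
        elif ner in ('TIME', 'DATE', 'NUMBER'):       # entity-based values
            word = VALUES_CONVERT.get(word, word)     # e.g. 'one' -> '1'
            if ner == 'TIME' and len(word) == 4:
                word = word.zfill(5)                  # '8:30' -> '08:30'
            if word not in STOPWORDS and word not in candidates:
                candidates.append(word)
        prev_word = word
    return candidates

print(apply_rules([('not', 'RB', 'O'), ('expensive', 'JJ', 'O'),
                   ('friday', 'NNP', 'DATE'), ('8:30', 'CD', 'TIME')]))
# -> ['cheap', 'friday', '08:30']
```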
 ```shell
@@ -254,7 +265,7 @@ python evaluate.py -o path/to/outputs/file
 | 125-dpd | 46.49 | 91.86 |
 | 250-dpd | 47.06 | 92.08 |
 
-> **Note:** All the generated output files for the above reported results are available in the repository. Check [outputs/prompt-learning](outputs/prompt-learning) directory to see the output JSON files for each data-split.
+> **Note:** All the generated output files for the above reported results are available in this repository. Check the [outputs/prompt-learning](outputs/prompt-learning) directory to see the output JSON files for each data-split.
 
 
 ## Multi-prompt Learning Experiments
@@ -313,3 +324,7 @@ sh test_prompting.sh -m
 
 > **Note:** All the generated output files for the above reported results are available in this repository. Check [outputs/multi-prompt](outputs/multi-prompt) directory to see the output JSON files for each data-split.
 
+
+## Analysis
+
+Analyses of the results and belief state generations (outputs) can be found [here](ANALYSIS.md).
\ No newline at end of file
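Before the `utils/corenlp.py` diff below, a note on the `~1GB` download mentioned in the README: it comes from stanza's CoreNLP installer. The following is a minimal setup sketch reusing the directory, model, and version values from `utils/corenlp.py`; the context-manager style here is an assumption of convenience and differs from the repo's explicit `start()`/`stop()` calls.

```python
import os
from pathlib import Path

import stanza
from stanza.server import CoreNLPClient

CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'

# one-time setup: fetch CoreNLP (~1GB) plus the English "extra" models
stanza.install_corenlp(dir=CORENLP_DIR)
stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
os.environ["CORENLP_HOME"] = CORENLP_DIR  # tell stanza where CoreNLP lives

# the client launches a CoreNLP server around the block and shuts it down after
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'ner'], be_quiet=True) as client:
    annotation = client.annotate('i need a cheap restaurant on friday at 08:30 .')
    for sent in annotation.sentence:
        for token in sent.token:
            print(token.word, token.pos, token.ner)
```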
diff --git a/utils/corenlp.py b/utils/corenlp.py
index 8d2f92e..f5a5437 100644
--- a/utils/corenlp.py
+++ b/utils/corenlp.py
@@ -1 +1 @@
 import stanza
 import os
 from pathlib import Path
 from stanza.server import CoreNLPClient
 
 STOPWORDS_FILE = "../data/resource/stopwords.txt"
 CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'
 
 # properties for the CoreNLP Server
 ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'regexner']
 
 # install/download the latest version of CoreNLP
 stanza.install_corenlp(dir=CORENLP_DIR)
 
 extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
 if not os.path.isfile(extra_model_jar):
     # download corenlp english models
     stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
 else:
     print('English Extra CoreNLP models available!')
 
 # set environment var of installation location
 os.environ["CORENLP_HOME"] = CORENLP_DIR
 
 VALUES_CONVERT = {
     'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
     'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
     'not important': 'dont care',
-    'not expensive': 'cheap'
+    'not expensive': 'cheap',
+    'not cheap': 'expensive'
 }
 
 # corenlp properties
 properties = {
     'ner.applyFineGrained': False,
     'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
     'ner.model': 'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,'
                  'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz,'
                  'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
     'ner.additional.regexner.mapping': str(Path().absolute()) + '/regexner.rules'
 }
 
 DATE_TIME_NUM = {'TIME', 'DATE', 'NUMBER'}
 
 
 class ValueExtractor:
     def __init__(self):
         # create the CoreNLP client
         self.client = CoreNLPClient(
             annotators=ANNOTATORS,
             properties=properties,
             timeout=300000,
             memory='8G',
             threads=8,
             be_quiet=True
         )
         # load the stopwords.txt file
         self.stopwords = set()
         with open(STOPWORDS_FILE, 'r') as file:
             for line in file:
                 self.stopwords.add(line.strip())
 
     def start(self):
         self.client.start()
 
     def stop(self):
         self.client.stop()
 
     def extract_value_candidates(self, history):
         # format history and prepare text for annotation
         user_utterance = ''
         for utterance in history:
             # consider only user utterances
             if utterance.startswith('user :'):
                 utterance = utterance[len('user :'):]
                 # if sentence doesn't end with punctuation, add .
                 if not utterance.endswith(('.', '?', '!')):
                     utterance = utterance + '.'
                 # append all utterances
                 user_utterance = user_utterance + utterance + ' '
 
         if not user_utterance:
             return []
 
         value_candidates = []
         user_utterance = user_utterance.replace("'", "")
         # use corenlp client and annotate text
         annotation = self.client.annotate(user_utterance)
 
         date_day = ''
         for sent in annotation.sentence:
             prev_word = ''
             for token in sent.token:
-                # TODO :: remove
-                # print("{:12s}\t{:12s}\t{:6s}\t{}".format(token.word, token.lemma, token.pos, token.ner))
                 word = token.word.strip()
                 # extract Adjectives & Adverbs using POS tags
                 # exclude the custom entity types
                 if token.pos in ['JJ', 'RB'] \
                         and token.ner not in ['AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE']:
                     # check if the word ends with 'ly'
                     word = word.removesuffix('ly')
                     if word not in self.stopwords and word not in value_candidates:
                         if prev_word == 'not' and prev_word + ' ' + word in VALUES_CONVERT:
                             word = VALUES_CONVERT[prev_word + ' ' + word]
                         value_candidates.append(word)
                 prev_word = word
 
                 # extract day, time & numbers
                 if token.ner in DATE_TIME_NUM:
                     if word in VALUES_CONVERT:
                         word = VALUES_CONVERT[word]
                     if token.ner == "DATE" and token.pos not in ["NNP", "CD"]:
                         continue
                     if token.ner == "DATE" and token.pos == "NNP":
                         date_day = word
                         continue
                     if word not in self.stopwords:
                         if token.ner == "TIME":
                             if not word[0].isdigit():
                                 continue
                             if len(word) == 4:
                                 word = word.zfill(5)
                         if word in value_candidates:
                             continue
                         value_candidates.append(word)
 
         # add day/date to value candidates
         if date_day:
             value_candidates.append(date_day)
 
         entity_map = {}
         # extract named entities (place, area,...)
         for sent in annotation.sentence:
             for mention in sent.mentions:
-                # TODO :: remove
-                # print("{:30s}\t{}".format(mention.entityMentionText, mention.entityType))
                 entity_text = mention.entityMentionText.strip()
                 if mention.entityType not in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                         and entity_text not in value_candidates:
                     if mention.entityType in ['AREA', 'FOOD_TYPE', 'PLACE_TYPE']:
                         entity_map[mention.entityType] = entity_text
                     else:
                         value_candidates.append(entity_text)
 
         more_values = list(entity_map.values())
-        return value_candidates + more_values
-
-
-# sample test TODO:: remove
-# dialog_history = [
-#     "user : i need a taxi to go to mahal of cambridge",
-#     "system : i can help with that . did you have a specific time in mind ?",
-#     "user : yes, i'd like to leave after 2:30 please.",
-#     "system : where would you like to depart from ?",
-#     "user : i am departing from jesus green outdoor pool.",
-#     "system : your taxi is booked . it is a [taxi_type] .",
-#     "user : what is the contact number for the taxi?",
-#     "system : the contact number is [taxi_phone] . can i help you with anything else today ?",
-#     "user : thanks that's all for today ."
-# ]
-# extractor = ValueExtractor()
-# extractor.start()
-# values = extractor.extract_value_candidates(dialog_history)
-# extractor.stop()
-# print('Extracted Values: ', values)
\ No newline at end of file
+        return value_candidates + more_values
\ No newline at end of file
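The commented-out sample test removed by this patch doubles as the module's usage example. Below is a runnable sketch of it, using dialog turns from that test; the import path is an assumption (run from the `utils/` directory so the relative `STOPWORDS_FILE` path resolves), and importing the module triggers the CoreNLP install shown earlier.

```python
from corenlp import ValueExtractor  # assumes cwd == utils/, so STOPWORDS_FILE resolves

dialog_history = [
    "user : i need a taxi to go to mahal of cambridge",
    "system : i can help with that . did you have a specific time in mind ?",
    "user : yes, i'd like to leave after 2:30 please.",
    "user : i am departing from jesus green outdoor pool.",
]

extractor = ValueExtractor()
extractor.start()   # boots the CoreNLP server
values = extractor.extract_value_candidates(dialog_history)  # only 'user :' turns are mined
extractor.stop()
print('Extracted Values:', values)
```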