import stanza
import os
from pathlib import Path
from stanza.server import CoreNLPClient

STOPWORDS_FILE = "../data/resource/stopwords.txt"
CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'

# annotators for the CoreNLP server
ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'regexner']

# install/download the latest version of CoreNLP
stanza.install_corenlp(dir=CORENLP_DIR)

extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
if not os.path.isfile(extra_model_jar):
    # download the extra English CoreNLP models
    stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
else:
    print('English Extra CoreNLP models available!')

# point CORENLP_HOME at the installation location
os.environ["CORENLP_HOME"] = CORENLP_DIR

# normalisation map: spelled-out numbers and negated phrases -> canonical values
VALUES_CONVERT = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
    'ten': '10',
    'not important': 'dont care',
    'not expensive': 'cheap'
}

# CoreNLP properties: caseless POS/NER models plus the custom regexner mapping
properties = {
    'ner.applyFineGrained': False,
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model': 'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,'
                 'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz,'
                 'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
    'ner.additional.regexner.mapping': str(Path().absolute()) + '/regexner.rules'
}

DATE_TIME_NUM = {'TIME', 'DATE', 'NUMBER'}


class ValueExtractor:

    def __init__(self):
        # create the CoreNLP client
        self.client = CoreNLPClient(
            annotators=ANNOTATORS,
            properties=properties,
            timeout=300000,
            memory='8G',
            threads=8,
            be_quiet=True
        )
        # load the stopwords.txt file, one stopword per line
        self.stopwords = set()
        with open(STOPWORDS_FILE, 'r') as file:
            for line in file:
                self.stopwords.add(line.strip())

    def start(self):
        self.client.start()

    def stop(self):
        self.client.stop()

    def extract_value_candidates(self, history):
        # format the history and prepare the text for annotation
        user_utterance = ''
        for utterance in history:
            # consider only user utterances
            if utterance.startswith('user :'):
                utterance = utterance[len('user :'):]
                # if the sentence doesn't end with punctuation, add a '.'
                if not utterance.endswith(('.', '?', '!')):
                    utterance = utterance + '.'
                # append all user utterances into a single text
                user_utterance = user_utterance + utterance + ' '

        if not user_utterance:
            return []

        value_candidates = []
        user_utterance = user_utterance.replace("'", "")

        # annotate the text with the CoreNLP client
        annotation = self.client.annotate(user_utterance)

        for sent in annotation.sentence:
            prev_word = ''
            for token in sent.token:
                word = token.word.strip()

                # extract adjectives & adverbs using POS tags,
                # excluding the custom entity types
                if token.pos in ['JJ', 'RB'] \
                        and token.ner not in ['AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE']:
                    # strip a trailing 'ly' (e.g. 'moderately' -> 'moderate'); requires Python 3.9+
                    word = word.removesuffix('ly')
                    if word not in self.stopwords and word not in value_candidates:
                        # normalise negated phrases such as 'not important'
                        if prev_word == 'not' and prev_word + ' ' + word in VALUES_CONVERT:
                            word = VALUES_CONVERT[prev_word + ' ' + word]
                        value_candidates.append(word)

                # remember the previous token for negation handling
                prev_word = word

                # extract days, times & numbers
                if token.ner in DATE_TIME_NUM:
                    if word in VALUES_CONVERT:
                        word = VALUES_CONVERT[word]
                    # keep only proper-noun or numeric DATE tokens
                    if token.ner == "DATE" and token.pos not in ["NNP", "CD"]:
                        continue
                    if word not in self.stopwords:
                        if token.ner == "TIME":
                            # skip times that don't start with a digit
                            if not word[0].isdigit():
                                continue
                            # zero-pad 4-character times, e.g. '2:30' -> '02:30'
                            if len(word) == 4:
                                word = word.zfill(5)
                        if word in value_candidates:
                            continue
                        value_candidates.append(word)

        entity_map = {}
        # extract named entities (place, area, ...)
        for sent in annotation.sentence:
            for mention in sent.mentions:
                entity_text = mention.entityMentionText.strip()
                if mention.entityType not in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                        and entity_text not in value_candidates:
                    # custom entity types keep only the last mention per type
                    if mention.entityType in ['AREA', 'FOOD_TYPE', 'PLACE_TYPE']:
                        entity_map[mention.entityType] = entity_text
                    else:
                        value_candidates.append(entity_text)

        more_values = list(entity_map.values())
        return value_candidates + more_values


if __name__ == '__main__':
    # sample dialog to exercise the extractor
    dialog_history = [
        "user : i need a taxi to go to mahal of cambridge",
        "system : i can help with that . did you have a specific time in mind ?",
        "user : yes, i'd like to leave after 2:30 please.",
        "system : where would you like to depart from ?",
        "user : i am departing from jesus green outdoor pool.",
        "system : your taxi is booked . it is a [taxi_type] .",
        "user : what is the contact number for the taxi?",
        "system : the contact number is [taxi_phone] . can i help you with anything else today ?",
        "user : thanks that's all for today ."
    ]

    extractor = ValueExtractor()
    extractor.start()
    values = extractor.extract_value_candidates(dialog_history)
    extractor.stop()
    print('Extracted Values: ', values)
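
# Note: the custom entity types used above (AREA, PLACE, PLACE_TYPE, FOOD_TYPE)
# come from the project's regexner.rules file, which is not included here. As a
# rough sketch only (the entries below are hypothetical, not the project's actual
# rules), a RegexNER mapping file is tab-separated, with columns for the token
# pattern, the entity type to assign, the NER types it may overwrite, and an
# optional rule priority:
#
#     centre         AREA         LOCATION,O    2.0
#     chinese        FOOD_TYPE    O             2.0
#     guest house    PLACE_TYPE   O             2.0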