You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1 line
6.6 KiB

import stanza
import os
from pathlib import Path
from stanza.server import CoreNLPClient
# Path to the newline-delimited stopword list loaded by ValueExtractor.
STOPWORDS_FILE = "../data/resource/stopwords.txt"
# Local installation directory for the CoreNLP distribution.
CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'
# Annotator pipeline run by the CoreNLP server.
ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'regexner']
# Install/download the latest version of CoreNLP (no-op if already installed).
stanza.install_corenlp(dir=CORENLP_DIR)
extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
if not os.path.isfile(extra_model_jar):
    # Download the English "extra" CoreNLP models only when the jar is missing,
    # since download_corenlp_models re-fetches unconditionally.
    stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)
else:
    print('English Extra CoreNLP models available!')
# CoreNLPClient locates the installation through this environment variable.
os.environ["CORENLP_HOME"] = CORENLP_DIR
# Spelled-out digit words, index == numeric value.
_DIGIT_WORDS = ['zero', 'one', 'two', 'three', 'four',
                'five', 'six', 'seven', 'eight', 'nine', 'ten']
# Normalization map: spelled-out values found in user utterances -> the
# canonical form used as a slot value.
VALUES_CONVERT = {word: str(num) for num, word in enumerate(_DIGIT_WORDS)}
VALUES_CONVERT.update({
    'not important': 'dont care',
    'not expensive': 'cheap',
})
# CoreNLP server properties: caseless English POS/NER models plus the custom
# RegexNER rules file expected next to the working directory.
properties = {
    'ner.applyFineGrained': False,
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model': ','.join([
        'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
    ]),
    'ner.additional.regexner.mapping': str(Path().absolute()) + '/regexner.rules'
}
# NER types handled by the day/time/number extraction pass.
DATE_TIME_NUM = {'TIME', 'DATE', 'NUMBER'}
class ValueExtractor:
    """Extract candidate slot values from the user turns of a dialogue.

    Uses a CoreNLP pipeline to pull adjectives/adverbs, normalized
    times/dates/numbers, and named-entity mentions out of the concatenated
    user utterances.
    """

    def __init__(self):
        # Create the CoreNLP client; the server process is launched by start().
        self.client = CoreNLPClient(
            annotators=ANNOTATORS,
            properties=properties,
            timeout=300000,
            memory='8G',
            threads=8,
            be_quiet=True
        )
        # Load the stopword list, one word per line.
        self.stopwords = set()
        with open(STOPWORDS_FILE, 'r', encoding='utf-8') as file:
            for line in file:
                self.stopwords.add(line.strip())

    def start(self):
        """Start the CoreNLP server."""
        self.client.start()

    def stop(self):
        """Stop the CoreNLP server."""
        self.client.stop()

    def extract_value_candidates(self, history):
        """Return candidate value strings extracted from *history*.

        Args:
            history: list of utterance strings; only entries prefixed with
                'user :' are considered.

        Returns:
            list[str]: de-duplicated candidates — adjectives/adverbs,
            normalized dates/times/numbers, then named-entity mentions.
        """
        # Concatenate all user utterances into one text for annotation.
        user_utterance = ''
        for utterance in history:
            # consider only user utterances
            if utterance.startswith('user :'):
                utterance = utterance[len('user :'):]
                # if the sentence doesn't end with punctuation, add '.'
                if not utterance.endswith(('.', '?', '!')):
                    utterance = utterance + '.'
                user_utterance = user_utterance + utterance + ' '
        if not user_utterance:
            return []
        value_candidates = []
        # Drop apostrophes so contractions like "i'd" annotate cleanly.
        user_utterance = user_utterance.replace("'", "")
        # Use the CoreNLP client to annotate the text.
        annotation = self.client.annotate(user_utterance)
        date_day = ''
        for sent in annotation.sentence:
            prev_word = ''
            for token in sent.token:
                word = token.word.strip()
                # Extract adjectives & adverbs via POS tags, excluding the
                # custom entity types handled by the mention pass below.
                if token.pos in ['JJ', 'RB'] \
                        and token.ner not in ['AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE']:
                    # Normalize adverbs: strip a trailing 'ly' if present.
                    word = word.removesuffix('ly')
                    if word not in self.stopwords and word not in value_candidates:
                        # Fold negations, e.g. 'not expensive' -> 'cheap'.
                        if prev_word == 'not' and prev_word + ' ' + word in VALUES_CONVERT:
                            word = VALUES_CONVERT[prev_word + ' ' + word]
                        value_candidates.append(word)
                prev_word = word
                # Extract day, time & numbers.
                if token.ner in DATE_TIME_NUM:
                    if word in VALUES_CONVERT:
                        word = VALUES_CONVERT[word]
                    if token.ner == "DATE" and token.pos not in ["NNP", "CD"]:
                        continue
                    if token.ner == "DATE" and token.pos == "NNP":
                        # Remember the (last seen) day name; appended after the
                        # loop so it lands at the end of the candidate list.
                        date_day = word
                        continue
                    if word not in self.stopwords:
                        if token.ner == "TIME":
                            # Skip non-numeric time words ('morning', ...).
                            if not word[0].isdigit():
                                continue
                            # Zero-pad 'H:MM' to 'HH:MM' (e.g. '2:30' -> '02:30').
                            if len(word) == 4:
                                word = word.zfill(5)
                        if word in value_candidates:
                            continue
                        value_candidates.append(word)
        # add day/date to value candidates
        if date_day:
            value_candidates.append(date_day)
        entity_map = {}
        # Extract named entities (place, area, ...).
        for sent in annotation.sentence:
            for mention in sent.mentions:
                entity_text = mention.entityMentionText.strip()
                if mention.entityType not in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                        and entity_text not in value_candidates:
                    if mention.entityType in ['AREA', 'FOOD_TYPE', 'PLACE_TYPE']:
                        # Keep only one mention per custom entity type.
                        entity_map[mention.entityType] = entity_text
                    else:
                        value_candidates.append(entity_text)
        more_values = list(entity_map.values())
        return value_candidates + more_values
# sample test TODO:: remove
# dialog_history = [
# "user : i need a taxi to go to mahal of cambridge",
# "system : i can help with that . did you have a specific time in mind ?",
# "user : yes, i'd like to leave after 2:30 please.",
# "system : where would you like to depart from ?",
# "user : i am departing from jesus green outdoor pool.",
# "system : your taxi is booked . it is a [taxi_type] .",
# "user : what is the contact number for the taxi?",
# "system : the contact number is [taxi_phone] . can i help you with anything else today ?",
# "user : thanks that's all for today ."
# ]
# extractor = ValueExtractor()
# extractor.start()
# values = extractor.extract_value_candidates(dialog_history)
# extractor.stop()
# print('Extracted Values: ', values)