You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1 line
6.3 KiB
1 line
6.3 KiB
import stanza
|
|
import os
|
|
from pathlib import Path
|
|
from stanza.server import CoreNLPClient
|
|
|
|
# Newline-delimited stopword list, resolved relative to the working directory.
STOPWORDS_FILE = "../data/resource/stopwords.txt"

# CoreNLP is installed into a directory next to the current working directory.
CORENLP_DIR = '{}/corenlp-dir'.format(Path().absolute())

# Annotator pipeline requested from the CoreNLP server.
ANNOTATORS = 'tokenize ssplit pos lemma ner regexner'.split()
|
|
|
|
# Install (or reuse an existing) CoreNLP distribution under CORENLP_DIR.
stanza.install_corenlp(dir=CORENLP_DIR)

# The English "extra" models ship in a separate jar; download them only if
# they are not already present on disk.
extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
if os.path.isfile(extra_model_jar):
    print('English Extra CoreNLP models available!')
else:
    # download corenlp english models
    stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)

# Point the stanza client at the CoreNLP installation directory.
os.environ["CORENLP_HOME"] = CORENLP_DIR
|
|
|
|
# Normalisation table: spelled-out numbers map to digits, plus two fixed
# phrases mapped to their canonical slot-value forms.
VALUES_CONVERT = {
    word: str(number)
    for number, word in enumerate(
        ['zero', 'one', 'two', 'three', 'four', 'five',
         'six', 'seven', 'eight', 'nine', 'ten'])
}
VALUES_CONVERT['not important'] = 'dont care'
VALUES_CONVERT['not expensive'] = 'cheap'
|
|
|
|
# Server-side CoreNLP properties: caseless POS/NER models plus a custom
# regexner mapping file located next to this script.
properties = {
    'ner.applyFineGrained': False,
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model': ','.join([
        'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
    ]),
    'ner.additional.regexner.mapping': '{}/regexner.rules'.format(Path().absolute()),
}

# NER tags handled by the date/time/number extraction branch.
DATE_TIME_NUM = {'TIME', 'DATE', 'NUMBER'}
|
|
|
|
|
|
class ValueExtractor:
    """Extract candidate slot values from the user turns of a dialog history.

    Uses a CoreNLP server (via stanza's CoreNLPClient) to find:
      * adjectives/adverbs (POS JJ/RB), with a trailing 'ly' stripped,
      * dates, times and numbers (normalised through VALUES_CONVERT),
      * named entities, with custom types AREA/FOOD_TYPE/PLACE_TYPE
        deduplicated so only the last mention of each type survives.
    """

    def __init__(self):
        # create the CoreNLP client
        self.client = CoreNLPClient(
            annotators=ANNOTATORS,
            properties=properties,
            timeout=300000,  # milliseconds
            memory='8G',
            threads=8,
            be_quiet=True
        )

        # load the stopwords.txt file (one stopword per line)
        self.stopwords = set()
        with open(STOPWORDS_FILE, 'r') as file:
            for line in file:
                self.stopwords.add(line.strip())

    def start(self):
        # Start the backing CoreNLP server process.
        self.client.start()

    def stop(self):
        # Shut the backing CoreNLP server process down.
        self.client.stop()

    def extract_value_candidates(self, history):
        """Return a list of candidate value strings mined from *history*.

        :param history: dialog turns; only entries starting with 'user :'
            are considered (others are system turns and ignored).
        :return: list of unique candidate strings, with the last-seen
            AREA/FOOD_TYPE/PLACE_TYPE entity mentions appended at the end.
            Empty list when the history contains no user turns.
        """
        # format history and prepare text for annotation
        user_utterance = ''
        for utterance in history:
            # consider only user utterances
            if utterance.startswith('user :'):
                utterance = utterance[len('user :'):]
                # if sentence doesn't end with punctuation, add .
                if not utterance.endswith(('.', '?', '!')):
                    utterance = utterance + '.'
                # append all utterances
                user_utterance = user_utterance + utterance + ' '

        # no user turns at all -> nothing to extract
        if not user_utterance:
            return []

        value_candidates = []
        # drop apostrophes so contractions don't split into extra tokens
        user_utterance = user_utterance.replace("'", "")
        # use corenlp client and annotate text
        annotation = self.client.annotate(user_utterance)
        for sent in annotation.sentence:
            # prev_word enables 'not X' phrase lookups in VALUES_CONVERT
            # (e.g. 'not expensive' -> 'cheap'); reset per sentence.
            prev_word = ''
            for token in sent.token:
                # TODO :: remove
                # print("{:12s}\t{:12s}\t{:6s}\t{}".format(token.word, token.lemma, token.pos, token.ner))
                word = token.word.strip()
                # extract Adjectives & Adverbs using POS tags
                # exclude the custom entity types
                if token.pos in ['JJ', 'RB'] \
                        and token.ner not in ['AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE']:
                    # strip a trailing 'ly' (adverb -> base form, e.g. 'cheaply' -> 'cheap')
                    word = word.removesuffix('ly')
                    if word not in self.stopwords and word not in value_candidates:
                        if prev_word == 'not' and prev_word + ' ' + word in VALUES_CONVERT:
                            word = VALUES_CONVERT[prev_word + ' ' + word]
                        value_candidates.append(word)

                    # NOTE(review): prev_word is updated only on the JJ/RB
                    # path, so it tracks the previous adjective/adverb
                    # ('not' itself is tagged RB) — confirm this matches the
                    # intended 'not X' pairing behavior.
                    prev_word = word

                # extract day, time & numbers
                if token.ner in DATE_TIME_NUM:
                    # normalise spelled-out numbers ('two' -> '2')
                    if word in VALUES_CONVERT:
                        word = VALUES_CONVERT[word]
                    # keep only proper-noun or cardinal DATE tokens
                    # (drops relative words like 'today', 'tomorrow')
                    if token.ner == "DATE" and token.pos not in ["NNP", "CD"]:
                        continue
                    if word not in self.stopwords:
                        if token.ner == "TIME":
                            # drop TIME tokens that aren't digit-led (e.g. 'noon')
                            if not word[0].isdigit():
                                continue
                            # zero-pad 4-char times: '2:30' -> '02:30'
                            if len(word) == 4:
                                word = word.zfill(5)
                        if word in value_candidates:
                            continue
                        value_candidates.append(word)

        entity_map = {}
        # extract named entities (place, area,...)
        for sent in annotation.sentence:
            for mention in sent.mentions:
                # TODO :: remove
                # print("{:30s}\t{}".format(mention.entityMentionText, mention.entityType))
                entity_text = mention.entityMentionText.strip()
                if mention.entityType not in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                        and entity_text not in value_candidates:
                    if mention.entityType in ['AREA', 'FOOD_TYPE', 'PLACE_TYPE']:
                        # keep only one mention per custom type (last one wins)
                        entity_map[mention.entityType] = entity_text
                    else:
                        value_candidates.append(entity_text)
        more_values = list(entity_map.values())

        return value_candidates + more_values
|
|
|
|
|
|
# sample test TODO:: remove
|
|
# dialog_history = [
|
|
# "user : i need a taxi to go to mahal of cambridge",
|
|
# "system : i can help with that . did you have a specific time in mind ?",
|
|
# "user : yes, i'd like to leave after 2:30 please.",
|
|
# "system : where would you like to depart from ?",
|
|
# "user : i am departing from jesus green outdoor pool.",
|
|
# "system : your taxi is booked . it is a [taxi_type] .",
|
|
# "user : what is the contact number for the taxi?",
|
|
# "system : the contact number is [taxi_phone] . can i help you with anything else today ?",
|
|
# "user : thanks that's all for today ."
|
|
# ]
|
|
# extractor = ValueExtractor()
|
|
# extractor.start()
|
|
# values = extractor.extract_value_candidates(dialog_history)
|
|
# extractor.stop()
|
|
# print('Extracted Values: ', values)
|