You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1 line
5.6 KiB
1 line
5.6 KiB
import stanza
|
|
import os
|
|
from pathlib import Path
|
|
from stanza.server import CoreNLPClient
|
|
|
|
# Install location and stopword list. CORENLP_DIR is anchored at the current
# working directory; STOPWORDS_FILE is a relative path.
CORENLP_DIR = str(Path().absolute()) + '/corenlp-dir'
STOPWORDS_FILE = "../data/resource/stopwords.txt"

# Annotator pipeline requested from the CoreNLP server.
ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'regexner']

# Install/download the latest version of CoreNLP into CORENLP_DIR.
stanza.install_corenlp(dir=CORENLP_DIR)

# The english-extra models are only fetched when their jar is not on disk yet.
extra_model_jar = os.path.join(CORENLP_DIR, 'stanford-corenlp-4.5.1-models-english-extra.jar')
if os.path.isfile(extra_model_jar):
    print('English Extra CoreNLP models available!')
else:
    stanza.download_corenlp_models(model='english-extra', version='4.5.1', dir=CORENLP_DIR)

# Publish the installation directory through the CORENLP_HOME environment variable.
os.environ["CORENLP_HOME"] = CORENLP_DIR
|
|
|
|
# Normalisation table used by the value extractor: spelled-out numbers
# ('zero'..'ten') map to their digit strings, and a few negated phrases
# map to the value they stand for.
VALUES_CONVERT = dict(zip(
    ('zero', 'one', 'two', 'three', 'four', 'five',
     'six', 'seven', 'eight', 'nine', 'ten'),
    map(str, range(11)),
))
VALUES_CONVERT.update({
    'not important': 'dont care',
    'not expensive': 'cheap',
    'not cheap': 'expensive',
})
|
|
|
|
# CoreNLP properties: caseless POS tagger, three caseless NER models
# (passed as one comma-separated string) and an additional regexner
# mapping file located in the current working directory.
properties = {
    'ner.applyFineGrained': False,
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model': ','.join((
        'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz',
        'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz',
    )),
    'ner.additional.regexner.mapping': str(Path().absolute()) + '/regexner.rules',
}

# NER labels whose tokens are handled by the day/time/number branch of the extractor.
DATE_TIME_NUM = {'NUMBER', 'DATE', 'TIME'}
|
|
|
|
|
|
class ValueExtractor:
    """Extract value candidates from the user turns of a dialogue history.

    The user utterances are annotated by a CoreNLP server (POS tags, NER
    labels and entity mentions) and a flat list of candidate strings is
    derived from that annotation.

    The server is not started by the constructor. Call ``start()`` /
    ``stop()`` explicitly, or use the instance as a context manager::

        with ValueExtractor() as extractor:
            extractor.extract_value_candidates(history)
    """

    def __init__(self):
        """Create the CoreNLP client and load the stopword list."""
        # create the CoreNLP client
        self.client = CoreNLPClient(
            annotators=ANNOTATORS,
            properties=properties,
            timeout=300000,
            memory='8G',
            threads=8,
            be_quiet=True
        )

        # load the stopwords.txt file: one entry per line, with the
        # surrounding whitespace removed
        with open(STOPWORDS_FILE, 'r') as file:
            self.stopwords = {line.strip() for line in file}

    def start(self):
        """Start the underlying CoreNLP client."""
        self.client.start()

    def stop(self):
        """Stop the underlying CoreNLP client."""
        self.client.stop()

    def __enter__(self):
        """Start the client and return this extractor."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop the client; exceptions raised in the ``with`` body propagate."""
        self.stop()

    @staticmethod
    def _join_user_utterances(history):
        """Return all user utterances of *history* as one text.

        Only entries starting with ``'user :'`` are kept. The prefix is
        removed, a ``.`` is appended when the utterance does not already
        end in ``.``, ``?`` or ``!``, and every utterance is followed by a
        single space. Returns ``''`` when there is no user utterance.
        """
        parts = []
        for utterance in history:
            # consider only user utterances
            if not utterance.startswith('user :'):
                continue
            utterance = utterance.removeprefix('user :')
            # if sentence doesn't end with punctuation, add .
            if not utterance.endswith(('.', '?', '!')):
                utterance += '.'
            parts.append(utterance + ' ')
        return ''.join(parts)

    def extract_value_candidates(self, history):
        """Return the value candidates found in the user turns of *history*.

        Args:
            history: iterable of utterance strings; user turns are the ones
                prefixed with ``'user :'``.

        Returns:
            A list of strings: adjectives/adverbs, numbers and times in
            token order, then the last weekday-like DATE token (if any),
            then remaining entity mentions, and finally the last AREA /
            FOOD_TYPE / PLACE_TYPE mention of each type. An empty list is
            returned, without calling the server, when *history* holds no
            user utterance.
        """
        # format history and prepare text for annotation
        user_utterance = self._join_user_utterances(history)
        if not user_utterance:
            return []

        user_utterance = user_utterance.replace("'", "")
        # use corenlp client and annotate text
        annotation = self.client.annotate(user_utterance)

        value_candidates = []
        date_day = ''
        for sent in annotation.sentence:
            prev_word = ''
            for token in sent.token:
                word = token.word.strip()

                # extract Adjectives & Adverbs using POS tags
                # exclude the custom entity types
                if token.pos in ('JJ', 'RB') \
                        and token.ner not in ('AREA', 'PLACE', 'PLACE_TYPE', 'FOOD_TYPE'):
                    # drop a trailing 'ly'
                    word = word.removesuffix('ly')
                    if word not in self.stopwords:
                        # "not <word>" may stand for another value
                        negated = prev_word + ' ' + word
                        if prev_word == 'not' and negated in VALUES_CONVERT:
                            word = VALUES_CONVERT[negated]
                        # de-duplicate on the final (converted) value so a
                        # converted word is neither added twice nor lost
                        # because the unconverted word was seen before
                        if word not in value_candidates:
                            value_candidates.append(word)

                prev_word = word

                # extract day, time & numbers
                if token.ner not in DATE_TIME_NUM:
                    continue
                word = VALUES_CONVERT.get(word, word)

                if token.ner == "DATE":
                    if token.pos == "NNP":
                        # remember the day; it is appended after all tokens
                        date_day = word
                        continue
                    if token.pos != "CD":
                        continue

                if word in self.stopwords:
                    continue

                if token.ner == "TIME":
                    # slicing keeps an empty token from raising IndexError
                    if not word[:1].isdigit():
                        continue
                    # left-pad 4-character times, e.g. '9:30' -> '09:30'
                    if len(word) == 4:
                        word = word.zfill(5)

                if word not in value_candidates:
                    value_candidates.append(word)

        # add day/date to value candidates
        if date_day:
            value_candidates.append(date_day)

        # extract named entities (place, area,...)
        entity_map = {}
        for sent in annotation.sentence:
            for mention in sent.mentions:
                entity_text = mention.entityMentionText.strip()
                if mention.entityType in {'TIME', 'DATE', 'NUMBER', 'DURATION'} \
                        or entity_text in value_candidates:
                    continue
                if mention.entityType in ('AREA', 'FOOD_TYPE', 'PLACE_TYPE'):
                    # one value per type: a later mention overrides an earlier one
                    entity_map[mention.entityType] = entity_text
                else:
                    value_candidates.append(entity_text)

        return value_candidates + list(entity_map.values())
|
|
|