\documentclass[10pt]{beamer}
\usepackage{beamerthemesplit}
\usepackage[utf8]{inputenc}
\usepackage[font=small,figurename=Fig]{caption}
\usepackage{graphicx}
\graphicspath{ {images/} }
\usepackage[style=authoryear, backend=biber]{biblatex}
\addbibresource{bibliography.bib}
\usetheme{Frankfurt}
\usecolortheme{default}
\title[Prompt-based methods for DST]{Prompt-based methods for Dialog State Tracking}
\subtitle{Thesis Presentation}
\author[Pavan Mandava]{Mandava, Sai Pavan}
\institute{Institut für Maschinelle Sprachverarbeitung\\
Universität Stuttgart}
\date[Thesis Presentation]{15.02.2023}
\AtBeginSection[]
{
\begin{frame}
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\frame{\titlepage}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Introduction \& Motivation}
\begin{frame} \frametitle{Introduction}
\begin{itemize}
\item Task-oriented dialog systems
\begin{itemize}
\item perform a wide range of tasks across multiple domains
\item \textsl{E.g.\ ticket booking, restaurant booking, etc.}
\end{itemize}
\item Modular-based dialog systems
\begin{itemize}
\item NLU, DST, PL, NLG
\end{itemize}
\end{itemize}
\vspace{8pt}
\begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{modular_tod.png}
\caption{Modular-based task-oriented dialog system}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Dialog State Tracking (DST)}
\begin{itemize}
\item Essential module for the dialog system to understand user's requests
\item Tracks the user goals in the form of dialog states (or ``belief states'')
\item Dialog states contain a set of \textsl{(slot, value)} pairs
\begin{itemize}
\item Updated at each turn of the conversation
\end{itemize}
\end{itemize}
\begin{block} {DST Example}
\textsf{\textbf{USER:}} Plan a train trip to Berlin this Friday for two people.\\
\textbf{Belief states:} \{(\textsl{destination, Berlin}), (\textsl{day, Friday}), (\textsl{people, 2})\}
\end{block}
\begin{itemize}
\item Ontology of domains
\begin{itemize}
\item Represents knowledge \& information required for specific tasks
\item Contains pre-defined set of slots and all possible values for each slot
\item Some Neural-based models solve the DST as classification task
\end{itemize}
\item Problems with depending on ontology
\begin{itemize}
\item Ontology is hard to obtain for new domains
\item Costly and time-consuming
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{PLMs \& Prompt Learning}
\begin{itemize}
\item Pre-trained Language Models (PLMs)
\begin{itemize}
\item Trained on large amounts of textual data
\item Encode linguistic knowledge into the huge amount of parameters
\item Can be efficiently used to solve NLP tasks
\item BERT\parencite{devlin2019bert}, GPT-2\parencite{radford2019gpt2}, GPT-3\parencite{brown2020gpt3}
\end{itemize}
\item Prompt Learning
\begin{itemize}
\item New way of efficiently using the generation capabilities of PLMs to solve different language tasks
\item Downstream task is converted to a textual prompt and given as input, the PLM directly generates the outputs from prompts
\item Prompting methods can be effectively used under \textsl{zero-shot} and \textsl{few-shot} settings when there's not enough training data
\item GPT-3 \parencite{brown2020gpt3}, Few-shot Bot \parencite{madotto2021fsb}, \textsc{PET} \parencite{schick2021pet} explored prompt-based methods for several tasks
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt Learning (contd.)}
\begin{figure}
\centering
\includegraphics[width=0.75\textwidth]{prompt_terminology.png}
\caption{Terminology and notations in prompt learning}
\end{figure}
\vspace{-4pt}
\begin{itemize}
\item Prompt Types: \textsl{prefix} \& \textsl{cloze} prompts
\item Prompt selection: manual, discrete, \& continuous prompts
\item Training strategy: Fixed-prompt LM Fine Tuning
\begin{itemize}
\item fixed prompts are applied to training data and fine-tune the LM
\item under low-resource few-shot settings
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Motivation \& Objectives}
\begin{itemize}
\item Previous work \& their limitations
\begin{itemize}
\item \textsc{TOD-BERT} \parencite{wu2020tod-bert}
\begin{itemize}
\item Pre-trained BERT on 9 different task-oriented datasets
\item Fine-tuned for DST task as multi-class classification
\item Depends on the ontology of domains for predicting slot-values
\end{itemize}
\item \textsc{Soloist} \parencite{peng2021soloist}
\begin{itemize}
\item Pre-trained GPT-2 for two dialogue datasets
\item Fine-tuned to generate belief states as sequence of words
\item Performs poorly under low-resource settings
\end{itemize}
\end{itemize}
\item Research Objectives
\begin{itemize}
\item Can the dialog states be extracted from the PLM using prompts?
\item Can the prompt-based methods learn the DST task under low-resource settings without depending on the ontology of domains?
\item Compare prompt-based approach with the baseline model
\item Identify the drawbacks \& limitations of prompt-based approach
\item Can different multi-prompt techniques help improve the performance of DST task?
\end{itemize}
\end{itemize}
\end{frame}
\section{Methods}
\begin{frame} \frametitle{Dataset - MultiWOZ \parencite{budzianowski2018multiwoz}}
\begin{itemize}
\item MultiWOZ 2.1 \parencite{eric2019multiwoz} is used to benchmark the DST
\item Contains huge number of dialogues across multiple domains
\item Each Dialog $\rightarrow$ multiple turns $\rightarrow$ multiple \textsl{(slot,value)} pairs
\item Five domains are picked for few-shot experiments
\begin{itemize}
\item \textsl{Restaurant, Hotel, Attraction, Taxi, Train}
\end{itemize}
\item Six data splits are created to perform few-shot experiments
\begin{itemize}
\item Different proportions of dialogues in each split
\item All the five domains are evenly distributed in each split
\end{itemize}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.75\textwidth]{data_splits.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Baseline (\textsc{Soloist})}
\begin{itemize}
\item \textsc{Soloist} \parencite{peng2021soloist} is the baseline model
\item Initialized with 12-layer GPT-2 language model
\item Pre-training step
\begin{itemize}
\item Pre-trained on two task-oriented dialogue datasets
\item Pre-trained model is publicly available
\end{itemize}
\item Fine-tuning step
\begin{itemize}
\item Fine-tuned on all MultiWOZ 2.1 data splits to perform the belief predictions task
\item Takes dialog history as input and generates belief states as sequence of words
\item \textsl{belief: $\mathit{slot}_1 = \mathit{value}_1; \mathit{slot}_2 = \mathit{value}_2; \ldots$}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods}
\begin{itemize}
\item \cite{yang2022prompt} proposed prompt learning framework for DST
\item This approach doesn't depend on the ontology of domains
\item Two components: \textsl{value-based prompt} and \textsl{inverse prompt}
\item Value-based prompt uses belief state values in prompts and generates the slots from PLM
\item Inverse prompt is an auxiliary task that uses the slot generated from value-based prompt and attempts to generate back the value.
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.85\textwidth]{prompt_methods.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods - Training}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.1} % Default value: 1
\begin{tabular}{ll}
\hline
\multicolumn{1}{c}{\textbf{Type}} & \multicolumn{1}{c}{\textbf{Prompt templates}} \\
\hline
value-based prompt & belief states: value = [v], slot = [s] \\
inverse prompt & belief states: slot = [s], value = [v] \\
\hline
\end{tabular}
\endgroup
\end{table}
\begin{itemize}
\item The pre-trained Soloist is used to fine-tune the prompting methods
\item All MultiWOZ data splits are used in the fine-tuning phase
\item Loss function for value-based prompt
\[\mathcal{L}=-\sum_{t}^{|D|} \log P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)\]
\item Loss function for inverse prompt
\[\tilde{\mathcal{L}}=-\sum_{t}^{|D|} \log P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)\]
\item Total Loss: $\mathcal{L}^{*} = \mathcal{L} + w *\tilde{\mathcal{L}}$
\begin{itemize}
\item Experiments are performed on different inverse prompt weights $w$
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods - Testing}
\begin{itemize}
\item Testing slot generation
\begin{itemize}
\item During inference, only value-based prompts are used
\item Prompts are filled with values and given as input to PLM
\item Next word with the highest probability is the generated slot
\item Rule-based approach for extracting value candidates
\end{itemize}
\item Rule-based Value Extraction:
\begin{itemize}
\item Stanford CoreNLP Stanza is used to first extract POS tags
\item Adjectives \textsf{(JJ)} and Adverbs \textsf{(RB)} are considered as possible values
\item Consider the previous negator `not'
\item Consider all named entities (name of place, time, day, numbers)
\item Custom Regex NER rules, filtered stop words and repeated values
\end{itemize}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.72\textwidth]{corenlp.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Ensemble)}
\begin{itemize}
\item Only a single value-based prompt is used in the previous experiments
\item Multiple prompts can be used together to improve the performance
\item Prompt Ensembling uses multiple value-based prompts during training and inference to take advantage of different prompts
\item Four hand-crafted prompt templates for value-based prompt
\end{itemize}
\begin{table}
\centering
\begin{tabular}{c l}
\hline
\multicolumn{2}{c}{\textbf{Prompt ensemble templates}}\\
\hline
$f_{1}$ & belief states: [v] = [s]\\
$f_{2}$ & [v] is the value of [s]\\
$f_{3}$ & [v] is of slot type [s]\\
$f_{4}$ & belief states: value = [v], slot = [s]\\
\hline
\end{tabular}
\end{table}
\begin{itemize}
\item A single model is trained with multiple prompts
\item The probability of generated slot over multiple prompt functions:
\[P\left(s_{t} \mid c_{t}\right)=\sum_{k}^{|K|} \alpha_{k} * P\left(s_{t} \mid c_{t}, f_{k}\left(v_{t}\right)\right)\]
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Augmentation)}
\begin{itemize}
\item Provides a few additional answered prompts that can demonstrate to the PLM how the actual task can be performed
\item Sample selection is manually hand-picked from training data
\item Experiments are performed on two sets of demonstration samples
\begin{itemize}
\item Sample set 1: 8 demonstrations
\item Sample set 2: 5 demonstrations
\end{itemize}
\item Demonstrations are concatenated to the input during inference
\item Number of demonstration examples that can be used is bounded by the GPT-2 max input length of 1024
\end{itemize}
\begin{table}
\centering
\begingroup
\setlength{\tabcolsep}{2pt}
\begin{tabular}{ r l }
\hline
\multicolumn{2}{c}{\textbf{Demonstration learning}} \\
\hline
Book a cheap flight to Frankfurt. & \textit{Frankfurt} is of slot \textit{destination}\\
Plan a train trip to Berlin. & \textit{Berlin} is of slot \textit{destination}\\
Book a taxi to the University. & \textit{University} is of slot \textit{destination}\\
Book a train to Stuttgart. & \textit{Stuttgart} is of slot [s]\\
\hline
\end{tabular}
\endgroup
\end{table}
\end{frame}
\begin{frame} \frametitle{Evaluation Metrics}
\begin{itemize}
\item Joint Goal Accuracy (JGA)
\begin{itemize}
\item Standard evaluation metric for DST
\item Correct if all the predicted belief states match with the ground-truth
\item All the slots and values must exactly match
\end{itemize}
\item Rule-based value extraction methods may extract irrelevant values
\item JGA* \parencite{yang2022prompt}
\begin{itemize}
\item To exclude the influence of wrongly extracted values, JGA* is used
\item JGA* - Joint Goal Accuracy is computed only for the belief states where the values are extracted correctly
\end{itemize}
\end{itemize}
\end{frame}
\section{Results}
\begin{frame} \frametitle{Baseline (\textsc{Soloist}) results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{baseline_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{ Prompt-based methods}
\begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{prompt_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt Ensemble results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{ensemble_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt Augmentation results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{demonstration_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Comparison of results}
\begin{figure}
\centering
\includegraphics[width=0.83\textwidth]{comparison_results.png}
\end{figure}
\end{frame}
\section{Discussion}
\begin{frame} \frametitle{Analysis of \textsc{Soloist} model}
\begin{block}{Example of wrong belief state prediction}
\textsf{USER:} I need an expensive place to eat in the west.\\
\textsf{SYSTEM:} Is there a specific type of food you would like?\\
\textsf{USER:} yes, i would like eat indian food.\\
\textbf{True states:} (area, west),(food, indian),(pricerange, expensive)\\
\textbf{Generated:} \textsl{(area, west),(food, indian),(pricerange, {\color{red} cheap}),({\color{red}area, east})}
\end{block}
\begin{itemize}
\item Open-ended generation
\item Susceptible to generating random slot-value pairs
\item Repeated slot-value generations
\item From the above example:
\begin{itemize}
\item slot \textsl{area} is repeated with a different value
\item value for slot \textsl{pricerange} is incorrect
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Analysis of prompt-based methods}
\begin{block}{Incorrect slot generations by value-based prompt}
\textsf{USER:} I need to be picked up from pizza hut city centre after 04:30\\
\textbf{True states:} (departure, pizza hut city centre), (leave, 04:30)\\
\textbf{Generated:} \textsl{({\color{red}destination}, pizza hut city centre), ({\color{red}arrive}, 04:30)}
\end{block}
\begin{itemize}
\item Incorrect slots generated under low-resource splits {\small (i.e., \textsl{5-dpd,10-dpd})}
\item Model struggled to distinguish between slots:
\begin{itemize}
\item \textsl{departure vs destination}
\item \textsl{leave vs arrive}
\end{itemize}
\item Possibly due to limited training data
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Limitations of Value-based prompt}
\begin{block}{Repeated Values in Belief States}
\textsf{USER:} hi, can you help me find a 3 star place to stay?\\
\textsf{SYSTEM:} Is there a particular area or price range you prefer?\\
\textsf{USER:} how about a place in centre of town that is of type hotel.\\
\textsf{SYSTEM:} how long would you like to stay, and how many people?\\
\textsf{USER:} Ill arrive on saturday and stay for 3 nights with 3 people.\\
\textbf{True states:} (area, centre), (stars, \underline{3}), (type, hotel), (day, saturday), \\(stay, \underline{3}), (people, \underline{3})
\end{block}
\begin{itemize}
\item User requirements may have repeated values in belief states
\item Value for \textsl{stars}, \textsl{stay}, and \textsl{people} is the same
\item Value-based prompt can only generate one slot for all the repeated values
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Error Analysis of Value Extraction}
\begin{block}{Problems with Value Extraction}
\textsf{USER:} I want a place to stay that has free wifi and free parking.\\
\textsf{SYSTEM:} do you have a preference for area or price range?\\
\textsf{USER:} I dont have a preference. I want a hotel not guesthouse.\\
\textbf{True states:} (area, \underline{dont care}), (internet, \underline{yes}), (parking, \underline{yes}), \\(price, \underline{dont care}), (type, hotel)\\
\textbf{Extracted Values:} \textsl{free}, \textsl{hotel}\\
\hrulefill \\
\textsf{USER:} I kind of need help finding a nice hotel in the north part of town.\\
\textbf{True states:} (area, north), (price, expensive), (type, hotel)\\
\textbf{Extracted Values:} \textsl{\color{red}kind}, \textsl{\color{red}nice}, \textsl{hotel}, \textsl{north}
\end{block}
\begin{itemize}
\item Value Extraction on test split
\begin{itemize}
\item Accuracy of \textsl{79\%} on all the values
\item Turn-level accuracy of \textsl{49\%}
\end{itemize}
\item Drawbacks of extracting values from POS tags
\end{itemize}
\end{frame}
\section{Conclusion}
\begin{frame} \frametitle{Conclusion}
\begin{itemize}
\item Prompt-based methods learned the DST task efficiently under low-resource few-shot settings without relying on the ontology.
\item Prompt-based methods significantly outperformed the baseline \textsc{Soloist} model under low-resource settings.
\item Some limitations in the prompt-based approach
\item Prompt Ensemble model only achieved minor improvements over single value-based prompt
\item Performance of Prompt Augmentation is limited due to insufficient demonstration examples
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Future work}
\begin{itemize}
\item Explore automated prompt search methods for choosing the right prompts instead of manually creating the templates
\item Improve the value extraction methods
\begin{itemize}
\item Combination of text summarization and semantic tagging
\end{itemize}
\item Can bigger language models perform better in prompting the DST task?
\end{itemize}
\end{frame}
\section*{}
\begin{frame}[plain,noframenumbering,allowframebreaks]
\frametitle{References}
\printbibliography[heading=none]
\end{frame}
\section*{}
\begin{frame}
\centering \Large
\emph{Thanks for your time!}
\end{frame}
\end{document}
%% --- END OF FILE