\documentclass[10pt]{beamer}
\usepackage{beamerthemesplit}
\usepackage[utf8]{inputenc}
\usepackage[font=small,figurename=Fig]{caption}
\usepackage{graphicx}
\graphicspath{ {images/} }
\usepackage[style=authoryear, backend=biber]{biblatex}
\addbibresource{bibliography.bib}
\usetheme{Frankfurt}
\usecolortheme{default}
\title[Prompt-based methods for DST]{Prompt-based methods for Dialog State Tracking}
\subtitle{Thesis Presentation}
\author[Pavan Mandava]{Mandava, Sai Pavan}
\institute{Institut für Maschinelle Sprachverarbeitung\\
Universität Stuttgart}
\date[Thesis Presentation]{15.02.2023}
\AtBeginSection[]
{
\begin{frame}
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\frame{\titlepage}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Introduction \& Motivation}
\begin{frame} \frametitle{Introduction}
\begin{itemize}
\item Task-oriented dialog systems
\begin{itemize}
\item perform a wide range of tasks across multiple domains
\item \textsl{E.g.\ ticket booking, restaurant booking, etc.}
\end{itemize}
\item Modular-based dialog systems
\begin{itemize}
\item NLU, DST, PL, NLG
\end{itemize}
\end{itemize}
\vspace{8pt}
\begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{modular_tod.png}
\caption{Modular-based task-oriented dialog system}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Dialog State Tracking (DST)}
\begin{itemize}
\item Essential module for the dialog system to understand user's requests
\item Tracks the user goals in the form of dialog states (or ``belief states'')
\item Dialog states contain a set of \textsl{(slot, value)} pairs
\begin{itemize}
\item Updated at each turn of the conversation
\end{itemize}
\end{itemize}
\begin{block} {DST Example}
\textsf{\textbf{USER:}} Plan a train trip to Berlin this Friday for two people.\\
\textbf{Belief states:} \{(\textsl{destination, Berlin}), (\textsl{day, Friday}), (\textsl{people, 2})\}
\end{block}
\begin{itemize}
\item Ontology of domains
\begin{itemize}
\item Represents knowledge \& information required for specific tasks
\item Contains pre-defined set of slots and all possible values for each slot
\item Some Neural-based models solve the DST as classification task
\end{itemize}
\item Problems with depending on ontology
\begin{itemize}
\item Ontology is hard to obtain for new domains
\item Costly and time-consuming
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{PLMs \& Prompt Learning}
\begin{itemize}
\item Pre-trained Language Models (PLMs)
\begin{itemize}
\item Trained on large amounts of textual data
\item Encode linguistic knowledge into the huge amount of parameters
\item Can be efficiently used to solve NLP tasks
\item BERT\parencite{devlin2019bert}, GPT-2\parencite{radford2019gpt2}, GPT-3\parencite{brown2020gpt3}
\end{itemize}
\item Prompt Learning
\begin{itemize}
\item New way of efficiently using the generation capabilities of PLMs to solve different language tasks
\item Downstream task is converted to a textual prompt and given as input, the PLM directly generates the outputs from prompts
\item Prompting methods can be effectively used under \textsl{zero-shot} and \textsl{few-shot} settings when there's not enough training data
\item GPT-3 \parencite{brown2020gpt3}, Few-shot Bot \parencite{madotto2021fsb}, \textsc{PET} \parencite{schick2021pet} explored prompt-based methods for several tasks
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt Learning (contd.)}
\begin{figure}
\centering
\includegraphics[width=0.75\textwidth]{prompt_terminology.png}
\caption{Terminology and notations in prompt learning}
\end{figure}
\vspace{-4pt}
\begin{itemize}
\item Prompt Types: \textsl{prefix} \& \textsl{cloze} prompts
\item Prompt selection: manual, discrete, \& continuous prompts
\item Training strategy: Fixed-prompt LM Fine Tuning
\begin{itemize}
\item fixed prompts are applied to training data and fine-tune the LM
\item under low-resource few-shot settings
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Motivation \& Objectives}
\begin{itemize}
\item Previous work \& their limitations
\begin{itemize}
\item \textsc{TOD-BERT} \parencite{wu2020tod-bert}
\begin{itemize}
\item Pre-trained BERT on 9 different task-oriented datasets
\item Fine-tuned for DST task as multi-class classification
\item Depends on the ontology of domains for predicting slot-values
\end{itemize}
\item \textsc{Soloist} \parencite{peng2021soloist}
\begin{itemize}
\item Pre-trained GPT-2 for two dialogue datasets
\item Fine-tuned to generate belief states as sequence of words
\item Performs poorly under low-resource settings
\end{itemize}
\end{itemize}
\item Research Objectives
\begin{itemize}
\item Can the dialog states be extracted from the PLM using prompts?
\item Can the prompt-based methods learn the DST task under low-resource settings without depending on the ontology of domains?
\item Compare prompt-based approach with the baseline model
\item Identify the drawbacks \& limitations of prompt-based approach
\item Can different multi-prompt techniques help improve the performance of DST task?
\end{itemize}
\end{itemize}
\end{frame}
\section{Methods}
\begin{frame} \frametitle{Dataset - MultiWOZ \parencite{budzianowski2018multiwoz}}
\begin{itemize}
\item MultiWOZ 2.1 \parencite{eric2019multiwoz} is used to benchmark the DST
\item Contains huge number of dialogues across multiple domains
\item Each Dialog $\rightarrow$ multiple turns $\rightarrow$ multiple \textsl{(slot,value)} pairs
\item Five domains are picked for few-shot experiments
\begin{itemize}
\item \textsl{Restaurant, Hotel, Attraction, Taxi, Train}
\end{itemize}
\item Six data splits are created to perform few-shot experiments
\begin{itemize}
\item Different proportions of dialogues in each split
\item All the five domains are evenly distributed in each split
\end{itemize}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.75\textwidth]{data_splits.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Baseline (\textsc{Soloist})}
\begin{itemize}
\item \textsc{Soloist} \parencite{peng2021soloist} is the baseline model
\item Initialized with 12-layer GPT-2 language model
\item Pre-training step
\begin{itemize}
\item Pre-trained on two task-oriented dialogue datasets
\item Pre-trained model is publicly available
\end{itemize}
\item Fine-tuning step
\begin{itemize}
\item Fine-tuned on all MultiWOZ 2.1 data splits to perform the belief predictions task
\item Takes dialog history as input and generates belief states as sequence of words
\item \textsl{belief: $\mathit{slot}_1 = \mathit{value}_1; \mathit{slot}_2 = \mathit{value}_2; \ldots$}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods}
\begin{itemize}
\item \cite{yang2022prompt} proposed prompt learning framework for DST
\item This approach doesn't depend on the ontology of domains
\item Two components: \textsl{value-based prompt} and \textsl{inverse prompt}
\item Value-based prompt uses belief state values in prompts and generates the slots from PLM
\item Inverse prompt is an auxiliary task that uses the slot generated from value-based prompt and attempts to generate back the value.
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.85\textwidth]{prompt_methods.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods - Training}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.1} % Default value: 1
\begin{tabular}{ll}
\hline
\multicolumn{1}{c}{\textbf{Type}} & \multicolumn{1}{c}{\textbf{Prompt templates}} \\
\hline
value-based prompt & belief states: value = [v], slot = [s] \\
inverse prompt & belief states: slot = [s], value = [v] \\
\hline
\end{tabular}
\endgroup
\end{table}
\begin{itemize}
\item The pre-trained Soloist is used to fine-tune the prompting methods
\item All MultiWOZ data splits are used in the fine-tuning phase
\item Loss function for value-based prompt
\[\mathcal{L}=-\sum_{t}^{|D|} \log P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)\]
\item Loss function for inverse prompt
\[\tilde{\mathcal{L}}=-\sum_{t}^{|D|} \log P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)\]
\item Total Loss: $\mathcal{L}^{*} = \mathcal{L} + w *\tilde{\mathcal{L}}$
\begin{itemize}
\item Experiments are performed on different inverse prompt weights $w$
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Prompt-based methods - Testing}
\begin{itemize}
\item Testing slot generation
\begin{itemize}
\item During inference, only value-based prompts are used
\item Prompts are filled with values and given as input to PLM
\item Next word with the highest probability is the generated slot
\item Rule-based approach for extracting value candidates
\end{itemize}
\item Rule-based Value Extraction:
\begin{itemize}
\item Stanford CoreNLP Stanza is used to first extract POS tags
\item Adjectives \textsf{(JJ)} and Adverbs \textsf{(RB)} are considered as possible values
\item Consider the previous negator `not'
\item Consider all named entities (name of place, time, day, numbers)
\item Custom Regex NER rules, filtered stop words and repeated values
\end{itemize}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.72\textwidth]{corenlp.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Ensemble)}
\begin{itemize}
\item Only a single value-based prompt is used in the previous experiments
\item Multiple prompts can be used together to improve the performance
\item Prompt Ensembling uses multiple value-based prompts during training and inference to take advantage of different prompts
\item Four hand-crafted prompt templates for value-based prompt
\end{itemize}
\begin{table}
\centering
\begin{tabular}{c l}
\hline
\multicolumn{2}{c}{\textbf{Prompt ensemble templates}}\\
\hline
$f_{1}$ & belief states: [v] = [s]\\
$f_{2}$ & [v] is the value of [s]\\
$f_{3}$ & [v] is of slot type [s]\\
$f_{4}$ & belief states: value = [v], slot = [s]\\
\hline
\end{tabular}
\end{table}
\begin{itemize}
\item A single model is trained with multiple prompts
\item The probability of generated slot over multiple prompt functions:
\[P\left(s_{t} \mid c_{t}\right)=\sum_{k}^{|K|} \alpha_{k} * P\left(s_{t} \mid c_{t}, f_{k}\left(v_{t}\right)\right)\]
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Augmentation)}
\begin{itemize}
\item Provides a few additional answered prompts that can demonstrate to the PLM how the actual task can be performed
\item Sample selection is manually hand-picked from training data
\item Experiments are performed on two sets of demonstration samples
\begin{itemize}
\item Sample set 1: 8 demonstrations
\item Sample set 2: 5 demonstrations
\end{itemize}
\item Demonstrations are concatenated to the input during inference
\item Number of demonstration examples that can be used is bounded by the GPT-2 max input length of 1024
\end{itemize}
\begin{table}
\centering
\begingroup
\setlength{\tabcolsep}{2pt}
\begin{tabular}{ r l }
\hline
\multicolumn{2}{c}{\textbf{Demonstration learning}} \\
\hline
Book a cheap flight to Frankfurt. & \textit{Frankfurt} is of slot \textit{destination}\\
Plan a train trip to Berlin. & \textit{Berlin} is of slot \textit{destination}\\
Book a taxi to the University. & \textit{University} is of slot \textit{destination}\\
Book a train to Stuttgart. & \textit{Stuttgart} is of slot [s]\\
\hline
\end{tabular}
\endgroup
\end{table}
\end{frame}
\begin{frame} \frametitle{Evaluation Metrics}
\begin{itemize}
\item Joint Goal Accuracy (JGA)
\begin{itemize}
\item Standard evaluation metric for DST
\item Correct if all the predicted belief states match with the ground-truth
\item All the slots and values must exactly match
\end{itemize}
\item Rule-based value extraction methods may extract irrelevant values
\item JGA* \parencite{yang2022prompt}
\begin{itemize}
\item To exclude the influence of wrongly extracted values, JGA* is used
\item JGA* - Joint Goal Accuracy is computed only for the belief states where the values are extracted correctly
\end{itemize}
\end{itemize}
\end{frame}
\section{Results}
\begin{frame} \frametitle{Baseline (\textsc{Soloist}) results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{baseline_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{ Prompt-based methods}
\begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{prompt_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt Ensemble results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{ensemble_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Prompt Augmentation results}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{demonstration_results.png}
\end{figure}
\end{frame}
\begin{frame} \frametitle{Comparison of results}
\begin{figure}
\centering
\includegraphics[width=0.83\textwidth]{comparison_results.png}
\end{figure}
\end{frame}
\section{Discussion}
\begin{frame} \frametitle{Analysis of \textsc{Soloist} model}
\begin{block}{Example of wrong belief state prediction}
\textsf{USER:} I need an expensive place to eat in the west.\\
\textsf{SYSTEM:} Is there a specific type of food you would like?\\
\textsf{USER:} yes, i would like eat indian food.\\
\textbf{True states:} (area, west),(food, indian),(pricerange, expensive)\\
\textbf{Generated:} \textsl{(area, west),(food, indian),(pricerange, {\color{red} cheap}),({\color{red}area, east})}
\end{block}
\begin{itemize}
\item Open-ended generation
\item Susceptible to generating random slot-value pairs
\item Repeated slot-value generations
\item From the above example:
\begin{itemize}
\item slot \textsl{area} is repeated with a different value
\item value for slot \textsl{pricerange} is incorrect
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Analysis of prompt-based methods}
\begin{block}{Incorrect slot generations by value-based prompt}
\textsf{USER:} I need to be picked up from pizza hut city centre after 04:30\\
\textbf{True states:} (departure, pizza hut city centre), (leave, 04:30)\\
\textbf{Generated:} \textsl{({\color{red}destination}, pizza hut city centre), ({\color{red}arrive}, 04:30)}
\end{block}
\begin{itemize}
\item Incorrect slots generated under low-resource splits {\small (i.e., \textsl{5-dpd,10-dpd})}
\item Model struggled to distinguish between slots:
\begin{itemize}
\item \textsl{departure vs destination}
\item \textsl{leave vs arrive}
\end{itemize}
\item Possibly due to limited training data
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Limitations of Value-based prompt}
\begin{block}{Repeated Values in Belief States}
\textsf{USER:} hi, can you help me find a 3 star place to stay?\\
\textsf{SYSTEM:} Is there a particular area or price range you prefer?\\
\textsf{USER:} how about a place in centre of town that is of type hotel.\\
\textsf{SYSTEM:} how long would you like to stay, and how many people?\\
\textsf{USER:} Ill arrive on saturday and stay for 3 nights with 3 people.\\
\textbf{True states:} (area, centre), (stars, \underline{3}), (type, hotel), (day, saturday), \\(stay, \underline{3}), (people, \underline{3})
\end{block}
\begin{itemize}
\item User requirements may have repeated values in belief states
\item Value for \textsl{stars}, \textsl{stay}, and \textsl{people} is the same
\item Value-based prompt can only generate one slot for all the repeated values
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Error Analysis of Value Extraction}
\begin{block}{Problems with Value Extraction}
\textsf{USER:} I want a place to stay that has free wifi and free parking.\\
\textsf{SYSTEM:} do you have a preference for area or price range?\\
\textsf{USER:} I dont have a preference. I want a hotel not guesthouse.\\
\textbf{True states:} (area, \underline{dont care}), (internet, \underline{yes}), (parking, \underline{yes}), \\(price, \underline{dont care}), (type, hotel)\\
\textbf{Extracted Values:} \textsl{free}, \textsl{hotel}\\
\hrulefill \\
\textsf{USER:} I kind of need help finding a nice hotel in the north part of town.\\
\textbf{True states:} (area, north), (price, expensive), (type, hotel)\\
\textbf{Extracted Values:} \textsl{\color{red}kind}, \textsl{\color{red}nice}, \textsl{hotel}, \textsl{north}
\end{block}
\begin{itemize}
\item Value Extraction on test split
\begin{itemize}
\item Accuracy of \textsl{79\%} on all the values
\item Turn-level accuracy of \textsl{49\%}
\end{itemize}
\item Drawbacks of extracting values from POS tags
\end{itemize}
\end{frame}
\section{Conclusion}
\begin{frame} \frametitle{Conclusion}
\begin{itemize}
\item Prompt-based methods learned the DST task efficiently under low-resource few-shot settings without relying on the ontology.
\item Prompt-based methods significantly outperformed the baseline \textsc{Soloist} model under low-resource settings.
\item Some limitations in the prompt-based approach
\item Prompt Ensemble model only achieved minor improvements over single value-based prompt
\item Performance of Prompt Augmentation is limited due to insufficient demonstration examples
\end{itemize}
\end{frame}
\begin{frame} \frametitle{Future work}
\begin{itemize}
\item Explore automated prompt search methods for choosing the right prompts instead of manually creating the templates
\item Improve the value extraction methods
\begin{itemize}
\item Combination of text summarization and semantic tagging
\end{itemize}
\item Can bigger language models perform better in prompting the DST task?
\end{itemize}
\end{frame}
\section*{}
\begin{frame}[plain,noframenumbering,allowframebreaks]
\frametitle{References}
\printbibliography[heading=none]
\end{frame}
\section*{}
\begin{frame}
\centering \Large
\emph{Thanks for your time!}
\end{frame}
\end{document}
%% --- END OF FILE