\documentclass[10pt]{beamer}
\usepackage{beamerthemesplit}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[font=small,figurename=Fig]{caption}
\usepackage{graphicx}
\graphicspath{ {images/} }
\usepackage[style=authoryear, backend=biber]{biblatex}
\addbibresource{bibliography.bib}
\usetheme{Frankfurt}
\usecolortheme{default}
\title[Prompt-based methods for DST]{Prompt-based methods for Dialog State Tracking}
\subtitle{Thesis Presentation}
\author[Pavan Mandava]{Mandava, Sai Pavan}
\institute{Institut für Maschinelle Sprachverarbeitung\\ Universität Stuttgart}
\date[Thesis Presentation]{15.02.2023}
\AtBeginSection[] { \begin{frame} \frametitle{Outline} \tableofcontents[currentsection] \end{frame} }
\begin{document}
\frame{\titlepage}
\begin{frame}{Outline} \tableofcontents \end{frame}
\section{Introduction \& Motivation}
\begin{frame} \frametitle{Introduction} \begin{itemize} \item Task-oriented dialog systems \begin{itemize} \item perform a wide range of tasks across multiple domains \item \textsl{E.g.,
ticket booking, restaurant booking, etc.} \end{itemize} \item Modular-based dialog systems \begin{itemize} \item NLU, DST, PL, NLG \end{itemize} \end{itemize} \vspace{8pt} \begin{figure} \centering \includegraphics[width=0.8\textwidth]{modular_tod.png} \caption{Modular-based task-oriented dialog system} \end{figure} \end{frame}
\begin{frame} \frametitle{Dialog State Tracking (DST)} \begin{itemize} \item Essential module for the dialog system to understand user's requests \item Tracks the user goals in the form of dialog states (or ``belief states'') \item Dialog states contain a set of \textsl{(slot, value)} pairs \begin{itemize} \item Updated at each turn of the conversation \end{itemize} \end{itemize} \begin{block} {DST Example} \textsf{\textbf{USER:}} Plan a train trip to Berlin this Friday for two people.\\ \textbf{Belief states:} \{(\textsl{destination, Berlin}), (\textsl{day, Friday}), (\textsl{people, 2})\} \end{block} \begin{itemize} \item Ontology of domains \begin{itemize} \item Contains pre-defined set of slots and all possible values for each slot \item Some Neural-based models solve the DST as a classification task \end{itemize} \item Problems with depending on ontology \begin{itemize} \item Ontology is hard to obtain for new domains \item Costly and time-consuming \end{itemize} \end{itemize} \end{frame}
\begin{frame} \frametitle{PLMs \& Prompt Learning} \begin{itemize} \item Pre-trained Language Models (PLMs) \begin{itemize} \item Trained on large amounts of textual data \item Encode linguistic knowledge into a huge number of parameters \item Can be efficiently used to solve NLP tasks \item BERT~\parencite{devlin2019bert}, GPT-2~\parencite{radford2019gpt2}, GPT-3~\parencite{brown2020gpt3} \end{itemize} \item Prompt Learning \begin{itemize} \item New way of efficiently using the generation capabilities of PLMs to solve different language tasks \item Downstream task is converted to a textual prompt and given as input; the PLM directly generates the outputs
from prompts \item GPT-3 \parencite{brown2020gpt3}, Few-shot Bot \parencite{madotto2021fsb}, \textsc{PET} \parencite{schick2021pet} explored prompt-based methods for several tasks \end{itemize} \end{itemize} \end{frame} \begin{frame} \frametitle{Prompt Learning (contd.)} \begin{figure} \centering \includegraphics[width=0.75\textwidth]{prompt_terminology.png} \caption{Terminology and notations in prompt learning} \end{figure} \vspace{-4pt} \begin{itemize} \item Prompt selection: manual, discrete, \& continuous prompts \item Training strategy: Fixed-prompt LM Fine Tuning \begin{itemize} \item fixed prompts are applied to training data and fine-tune the LM \item under low-resource few-shot settings \end{itemize} \end{itemize} \end{frame} \begin{frame} \frametitle{Motivation \& Objectives} \begin{itemize} \item Previous work \& their limitations \begin{itemize} \item \textsc{TOD-BERT} \parencite{wu2020tod-bert} \begin{itemize} \item Pre-trained BERT on 9 different task-oriented datasets \item Fine-tuned for DST task as multi-class classification \item Depends on the ontology of domains for predicting slot-values \end{itemize} \item \textsc{Soloist} \parencite{peng2021soloist} \begin{itemize} \item Pre-trained GPT-2 for two dialogue datasets \item Fine-tuned to generate belief states as sequence of words \item Performs poorly under low-resource settings \end{itemize} \end{itemize} \item Research Objectives \begin{itemize} \item Can the prompt-based methods learn the DST task efficiently under low-resource settings without depending on the ontology? \item Compare prompt-based approach with the baseline model \item Identify the drawbacks \& limitations of prompt-based approach \item Can different multi-prompt techniques help improve the performance of DST task? 
\end{itemize} \end{itemize} \end{frame}
\section{Methods}
\begin{frame} \frametitle{Dataset -- MultiWOZ \parencite{budzianowski2018multiwoz}} \begin{itemize} \item MultiWOZ 2.1 \parencite{eric2019multiwoz} is used to benchmark the DST \item Contains a huge number of dialogues across multiple domains \item Each Dialog $\rightarrow$ multiple turns $\rightarrow$ multiple \textsl{(slot, value)} pairs \item Five domains are picked for few-shot experiments \begin{itemize} \item \textsl{Restaurant, Hotel, Attraction, Taxi, Train} \end{itemize} \item Six data splits are created to perform few-shot experiments \begin{itemize} \item Different proportions of dialogues in each split \end{itemize} \end{itemize} \begin{figure} \centering \includegraphics[width=0.75\textwidth]{data_splits.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure} \end{frame}
\begin{frame} \frametitle{Baseline (\textsc{Soloist})} \begin{itemize} \item \textsc{Soloist} \parencite{peng2021soloist} is the baseline model \item Initialized with 12-layer GPT-2 language model \item Pre-training step \begin{itemize} \item Pre-trained on two task-oriented dialogue datasets \item Pre-trained model is publicly available \end{itemize} \item Fine-tuning step \begin{itemize} \item Fine-tuned on all MultiWOZ 2.1 data splits to perform the belief prediction task \item Takes dialog history as input and generates belief states as sequence of words \item \textsl{belief: $\mathit{slot}_1 = \mathit{value}_1; \mathit{slot}_2 = \mathit{value}_2; \ldots$} \end{itemize} \end{itemize} \end{frame}
\begin{frame} \frametitle{Prompt-based methods} \begin{itemize} \item \textcite{yang2022prompt} proposed a prompt learning framework for DST \item This approach doesn't depend on the ontology of domains \item Two components: \textsl{value-based prompt} and \textsl{inverse prompt} \item Value-based prompt uses belief state values in prompts and generates the slots from PLM \item Inverse prompt is an auxiliary task that uses the slot generated from value-based
prompt and attempts to generate back the value. \end{itemize} \begin{figure} \centering \includegraphics[width=0.85\textwidth]{prompt_methods.png}
%% \caption{Terminology and notations in prompt learning}
\end{figure} \end{frame}
\begin{frame} \frametitle{Prompt-based methods -- Training} \begin{table}[h!] \centering \begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.1} % Default value: 1
\begin{tabular}{ll} \hline \multicolumn{1}{c}{\textbf{Type}} & \multicolumn{1}{c}{\textbf{Prompt templates}} \\ \hline value-based prompt & belief states: value = [v], slot = [s] \\ inverse prompt & belief states: slot = [s], value = [v] \\ \hline \end{tabular} \endgroup \end{table} \begin{itemize} \item The pre-trained \textsc{Soloist} is used to fine-tune the prompting methods \item Loss function for value-based prompt \[\mathcal{L}=-\sum_{t}^{|D|} \log P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)\] \item Loss function for inverse prompt \[\tilde{\mathcal{L}}=-\sum_{t}^{|D|} \log P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)\] \item Total Loss: $\mathcal{L}^{*} = \mathcal{L} + w \cdot \tilde{\mathcal{L}}$ \begin{itemize} \item Experiments are performed on different inverse prompt weights $w$ \end{itemize} \end{itemize} \end{frame}
\begin{frame} \frametitle{Prompt-based methods -- Testing} \begin{itemize} \item Testing slot generation \begin{itemize} \item During inference, only value-based prompts are used \item Prompts are filled with values and given as input to PLM \item Next word with the highest probability is the generated slot \item Rule-based approach for extracting value candidates \end{itemize} \item Rule-based Value Extraction: \begin{itemize} \item Stanford CoreNLP Stanza is used to first extract POS tags \item Adjectives \textsf{(JJ)} and Adverbs \textsf{(RB)} are considered as possible values \item Consider the previous negator `not' \item Consider all named entities (name of place, time, day, numbers) \item Custom
Regex NER rules, filtered stop words and repeated values \end{itemize} \end{itemize} \begin{figure} \centering \includegraphics[width=0.72\textwidth]{corenlp.png} \end{figure} \end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Ensemble)} \begin{itemize} \item Only a single value-based prompt is used in the previous experiments \item Multiple prompts can be used together to improve the performance \item Prompt Ensembling uses multiple value-based prompts during training and inference to take advantage of different prompts \item Four hand-crafted prompt templates for value-based prompt \end{itemize} \begin{table} \centering \begin{tabular}{c l} \hline \multicolumn{2}{c}{\textbf{Prompt ensemble templates}}\\ \hline $f_{1}$ & belief states: [v] = [s]\\ $f_{2}$ & [v] is the value of [s]\\ $f_{3}$ & [v] is of slot type [s]\\ $f_{4}$ & belief states: value = [v], slot = [s]\\ \hline \end{tabular} \end{table} \begin{itemize} \item A single model is trained with multiple prompts \item The probability of the generated slot over multiple prompt functions: \[P\left(s_{t} \mid c_{t}\right)=\sum_{k}^{|K|} \alpha_{k} \cdot P\left(s_{t} \mid c_{t}, f_{k}\left(v_{t}\right)\right)\] \end{itemize} \end{frame}
\begin{frame} \frametitle{Multi-prompt method (Prompt Augmentation)} \begin{itemize} \item Provides a few additional answered prompts that can demonstrate to the PLM how the actual task can be performed \item Demonstration samples are hand-picked from the training data \item Experiments are performed on two sets of demonstration samples \begin{itemize} \item Sample set 1: 8 demonstrations \item Sample set 2: 5 demonstrations \end{itemize} \item Demonstrations are concatenated to the input during inference \item Number of demonstration examples that can be used is bounded by the GPT-2 max input length of 1024 \end{itemize} \begin{table} \centering \begingroup \setlength{\tabcolsep}{2pt} \begin{tabular}{ r l } \hline \multicolumn{2}{c}{\textbf{Demonstration learning}} \\
\hline Book a cheap flight to Frankfurt. & \textit{Frankfurt} is of slot \textit{destination}\\ Plan a train trip to Berlin. & \textit{Berlin} is of slot \textit{destination}\\ Book a taxi to the University. & \textit{University} is of slot \textit{destination}\\ Book a train to Stuttgart. & \textit{Stuttgart} is of slot [s]\\ \hline \end{tabular} \endgroup \end{table} \end{frame}
\begin{frame} \frametitle{Evaluation Metrics} \begin{itemize} \item Joint Goal Accuracy (JGA) \begin{itemize} \item Standard evaluation metric for DST \item Correct if all the predicted belief states match with the ground-truth \item All the slots and values must exactly match \end{itemize} \item Rule-based value extraction methods may extract irrelevant values \item JGA* \parencite{yang2022prompt} \begin{itemize} \item To exclude the influence of wrongly extracted values, JGA* is used \item JGA* -- Joint Goal Accuracy is computed only for the belief states where the values are extracted correctly \end{itemize} \end{itemize} \end{frame}
\section{Results}
\begin{frame} \frametitle{Baseline (\textsc{Soloist}) results} \begin{figure} \centering \includegraphics[width=0.9\textwidth]{baseline_results.png} \end{figure} \end{frame}
\begin{frame} \frametitle{Prompt-based methods results} \begin{figure} \centering \includegraphics[width=0.8\textwidth]{prompt_results.png} \end{figure} \end{frame}
\begin{frame} \frametitle{Prompt Ensemble results} \begin{figure} \centering \includegraphics[width=0.9\textwidth]{ensemble_results.png} \end{figure} \end{frame}
\begin{frame} \frametitle{Prompt Augmentation results} \begin{figure} \centering \includegraphics[width=0.9\textwidth]{demonstration_results.png} \end{figure} \end{frame}
\begin{frame} \frametitle{Comparison of results} \begin{figure} \centering \includegraphics[width=0.83\textwidth]{comparison_results.png} \end{figure} \end{frame}
\section{Discussion}
\begin{frame} \frametitle{Analysis of \textsc{Soloist} model} \begin{block}{Example of wrong belief
state prediction} \textsf{USER:} I need an expensive place to eat in the west.\\ \textsf{SYSTEM:} Is there a specific type of food you would like?\\ \textsf{USER:} yes, i would like eat indian food.\\ \textbf{True states:} (area, west),(food, indian),(pricerange, expensive)\\ \textbf{Generated:} \textsl{(area, west),(food, indian),(pricerange, {\color{red} cheap}),({\color{red}area, east})} \end{block} \begin{itemize}
%% \item Open-ended generation
\item Susceptible to generating random slot-value pairs \item Repeated slot-value generations \item From the above example: \begin{itemize} \item slot \textsl{area} is repeated with a different value \item value for slot \textsl{pricerange} is incorrect \end{itemize} \end{itemize} \end{frame}
\begin{frame} \frametitle{Analysis of prompt-based methods} \begin{block}{Incorrect slot generations by value-based prompt} \textsf{USER:} I need to be picked up from pizza hut city centre after 04:30\\ \textbf{True states:} (departure, pizza hut city centre), (leave, 04:30)\\ \textbf{Generated:} \textsl{({\color{red}destination}, pizza hut city centre), ({\color{red}arrive}, 04:30)} \end{block} \begin{itemize} \item Incorrect slots generated under low-resource splits {\small (i.e., \textsl{5-dpd, 10-dpd})} \item Model struggled to distinguish between slots: \begin{itemize} \item \textsl{departure vs destination} \item \textsl{leave vs arrive} \end{itemize} \item Possibly due to limited training data \end{itemize} \end{frame}
\begin{frame} \frametitle{Limitations of Value-based prompt} \begin{block}{Repeated Values in Belief States} \textsf{USER:} hi, can you help me find a 3 star place to stay?\\ \textsf{SYSTEM:} Is there a particular area or price range you prefer?\\ \textsf{USER:} how about a place in centre of town that is of type hotel.\\ \textsf{SYSTEM:} how long would you like to stay, and how many people?\\ \textsf{USER:} I’ll arrive on saturday and stay for 3 nights with 3 people.\\ \textbf{True states:} (area, centre), (stars,
\underline{3}), (type, hotel), (day, saturday), \\(stay, \underline{3}), (people, \underline{3}) \end{block} \begin{itemize} \item User requirements may have repeated values in belief states \item Value for \textsl{stars}, \textsl{stay}, and \textsl{people} is the same \item Value-based prompt can only generate one slot for all the repeated values \end{itemize} \end{frame} \begin{frame} \frametitle{Error Analysis of Value Extraction} \begin{block}{Problems with Value Extraction} \textsf{USER:} I want a place to stay that has free wifi and free parking.\\ \textsf{SYSTEM:} do you have a preference for area or price range?\\ \textsf{USER:} I don’t have a preference. I want a hotel not guesthouse.\\ \textbf{True states:} (area, \underline{dont care}), (internet, \underline{yes}), (parking, \underline{yes}), \\(price, \underline{dont care}), (type, hotel)\\ \textbf{Extracted Values:} \textsl{free}, \textsl{hotel}\\ \hrulefill \\ \textsf{USER:} I kind of need help finding a nice hotel in the north part of town.\\ \textbf{True states:} (area, north), (price, expensive), (type, hotel)\\ \textbf{Extracted Values:} \textsl{\color{red}kind}, \textsl{\color{red}nice}, \textsl{hotel}, \textsl{north} \end{block} \begin{itemize} \item Value Extraction on test split \begin{itemize} \item Accuracy of \textsl{79\%} on all the values \item Turn-level accuracy of \textsl{49\%} \end{itemize} \item Drawbacks of extracting values from POS tags \end{itemize} \end{frame} \section{Conclusion} \begin{frame} \frametitle{Conclusion} \begin{itemize} \item Prompt-based methods learned the DST task efficiently under low-resource few-shot settings without relying on the ontology. \item Prompt-based methods significantly outperformed the baseline \textsc{Soloist} model under low-resource settings. 
\item Some limitations in the prompt-based approach \item Prompt Ensemble model only achieved minor improvements over single value-based prompt \item Performance of Prompt Augmentation is limited due to insufficient demonstration examples \end{itemize} \end{frame} \begin{frame} \frametitle{Future work} \begin{itemize} \item Explore automated prompt search methods for choosing the right prompts instead of manually creating the templates \item Improve the value extraction methods \begin{itemize} \item Combination of text summarization and semantic tagging \end{itemize} \item Can bigger language models perform better in prompting the DST task? \end{itemize} \end{frame} \section{} \begin{frame}[plain,noframenumbering,allowframebreaks] \frametitle{References} \printbibliography[heading=none] \end{frame} \section{} \begin{frame} \centering \Large \emph{Thanks for your time!} \end{frame} \end{document} %% --- END OF FILE