diff --git a/writing/latex/references.bib b/writing/latex/references.bib index 3d245e3..18efaee 100644 --- a/writing/latex/references.bib +++ b/writing/latex/references.bib @@ -324,3 +324,38 @@ url={https://openreview.net/forum?id=rygGQyrFvH} year = {2015}, url = {http://arxiv.org/abs/1412.6980} } +@article{ni2021dlds, + author = {Jinjie Ni and + Tom Young and + Vlad Pandelea and + Fuzhao Xue and + Vinay Adiga and + Erik Cambria}, + title = {Recent Advances in Deep Learning Based Dialogue Systems: {A} Systematic + Survey}, + journal = {CoRR}, + volume = {abs/2105.04387}, + year = {2021}, + url = {https://arxiv.org/abs/2105.04387}, + eprinttype = {arXiv}, + eprint = {2105.04387}, + timestamp = {Mon, 31 May 2021 08:19:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2105-04387.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +@inproceedings{wu2020tod-bert, + title = "{TOD}-{BERT}: Pre-trained Natural Language Understanding for Task-Oriented Dialogue", + author = "Wu, Chien-Sheng and + Hoi, Steven C.H. and + Socher, Richard and + Xiong, Caiming", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + month = nov, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.emnlp-main.66", + doi = "10.18653/v1/2020.emnlp-main.66", + pages = "917--929", + abstract = "The underlying difference of linguistic patterns between general text and task-oriented dialogue makes existing pre-trained language models less useful in practice. In this work, we unify nine human-human and multi-turn task-oriented dialogue datasets for language modeling. To better model dialogue behavior during pre-training, we incorporate user and system tokens into the masked language modeling. We propose a contrastive objective function to simulate the response selection task. Our pre-trained task-oriented dialogue BERT (TOD-BERT) outperforms strong baselines like BERT on four downstream task-oriented dialogue applications, including intention recognition, dialogue state tracking, dialogue act prediction, and response selection. We also show that TOD-BERT has a stronger few-shot ability that can mitigate the data scarcity problem for task-oriented dialogue.", +} diff --git a/writing/latex/sections/02_intro.tex b/writing/latex/sections/02_intro.tex index 1266c05..633df21 100644 --- a/writing/latex/sections/02_intro.tex +++ b/writing/latex/sections/02_intro.tex @@ -4,24 +4,12 @@ \paragraph{} Prompt-based learning \textit{(\textquote{pre-train, prompt, and predict})} is a new paradigm in NLP that aims to predict the probability of text directly from the pre-trained LM. This framework is powerful as it allows the language model to be \textit{pre-trained} on massive amounts of raw text, and by defining a new prompting function the model can perform \textit{few-shot} or even \textit{zero-shot} learning \citep{liu2021ppp}. The large pre-trained language models (PLMs) are supposed to be useful in few-shot scenarios where the task-related training data is limited, as they can be probed for task-related knowledge efficiently by using a prompt. One example of such large pre-trained language models is GPT-3 \citep{brown2020gpt3} - \textit{\textquote{Language Models are Few-Shot Learners}}. 
\citet{madotto2021fsb} created an end-to-end chatbot (Few-Shot Bot) using \textit{prompt-based few-shot learning} and achieved results comparable to the state of the art. Prompting methods are particularly helpful in few-shot learning where domain-related data is limited. \textit{Fixed-prompt LM tuning} is a fine-tuning strategy for downstream tasks, where the LM parameters are tuned with fixed prompts to help the LM understand the task. This can be achieved by applying a discrete textual prompt template to the data used for fine-tuning the PLM. -\paragraph{} Prompt-based learning for few-shot DST with limited labeled domains is still under-explored. Recently, \citet{yang2022prompt} proposed a new prompt learning framework for few-shot DST. This work designed a \textit{value-based prompt} and an \textit{inverse prompt} mechanism to efficiently train a DST model for domains with limited training data. This approach doesn't depend on the ontology of slots and the results show that it can generate slots by prompting the tuned PLM and outperforms the existing state-of-the-art methods under few-shot settings. In this thesis, the prompt-based few-shot methods for DST are explored by implementing the following three tasks: -\begin{enumerate} - \item Prompt-based few-shot DST - reproduce the results from \citet{yang2022prompt} - \begin{itemize} - \item[--] Implement prompt-based methods for DST task under few-shot settings - \item[--] Implement a baseline model for comparing the prompt-based methods - \end{itemize} - \item Evaluation and analyses of belief state predictions - \begin{itemize} - \item[--] Evaluate the DST task using Joint Goal Accuracy (JGA) metric - \item[--] Improvements observed from the prompt-based methods - \item[--] Drawbacks of the prompt-based methods - \end{itemize} - \item Extend prompt-based methods to utilize various \textit{multi-prompt} techniques - \begin{itemize} - \item[--] Can different multi-prompt techniques help the PLM better understand the DST task? - \item[--] Evaluation of multi-prompt methods and what's the influence of various multi-prompt techniques? - \end{itemize} -\end{enumerate} +\paragraph{} Prompt-based learning for few-shot DST with limited labeled domains is still under-explored. Recently, \citet{yang2022prompt} proposed a new prompt learning framework for few-shot DST. This work designed a \textit{value-based prompt} and an \textit{inverse prompt} mechanism to efficiently train a DST model for domains with limited training data. This approach does not depend on the ontology of slots, and the results show that it can generate slots by prompting the tuned PLM, outperforming the existing state-of-the-art methods under few-shot settings. + +\paragraph{} The main research objective of this thesis is to investigate the effectiveness of prompt-based methods for DST and to understand the limitations of this approach. Prompt-based methods are adopted for the DST task to answer the following research questions: \textsf{Q:} Can the dialogue belief states be extracted directly from a PLM using prompt-based methods? \textsf{Q:} Can the prompt-based methods learn the DST task under low-resource settings without depending on the ontology of domains? \textsf{Q:} How does the prompt-based approach perform overall compared to a baseline model? \textsf{Q:} What are the drawbacks and limitations of prompt-based methods? \textsf{Q:} Can different multi-prompt techniques help the PLM understand the DST task better?
\textsf{Q:} What impact do various multi-prompt methods have on the performance of the DST task? + +\paragraph{} To accomplish the research objectives, the prompt learning framework designed by \citet{yang2022prompt}, which includes a \textit{value-based prompt} and an \textit{inverse prompt}, is utilized to generate the belief states by prompting the PLM. Few-shot experiments are performed on different proportions of data to evaluate the prompt-based methods under low-resource settings. A baseline model, which also does not depend on the ontology of dialogue domains, is trained on the DST task to compare with the prompt-based methods. A detailed error analysis is conducted to identify the limitations of prompt-based methods. Further, multi-prompt methods are adopted to help the PLM better understand the DST task. + +\paragraph{} This section gave an overview of the thesis topic, motivation, and research objectives. The next section presents the background and related work (section \ref{sec:background}) with details on the following topics: dialog state tracking (DST), pre-trained language models (PLMs), the baseline model, prompting methods, and the dataset used. The research methods used in the thesis experiments, including the few-shot experiments with the baseline and prompt-based methods, the multi-prompt methods, and the evaluation metrics, are detailed in section \ref{sec:methods}. Section \ref{sec:results} provides all the few-shot experimental results of the research methods adopted. Analysis and discussion of the results are presented in section \ref{sec:analysis}. Finally, the conclusion (section \ref{sec:conclusion}) summarizes the main findings. \clearpage \ No newline at end of file diff --git a/writing/latex/sections/03_background.tex index 9a11e43..e5a2658 100644 --- a/writing/latex/sections/03_background.tex +++ b/writing/latex/sections/03_background.tex @@ -2,25 +2,50 @@ \subsection{Dialog State Tracking (DST)} -\paragraph{} Task-oriented dialog systems, both modular and end-to-end systems, are capable of handling a wide range of tasks (such as ticket booking, restaurant booking, etc.) across various domains. A task-oriented dialogue system has stricter requirements for responses because it needs to accurately understand and process the user's message. Therefore, modular methods were suggested as a way to generate responses in a more controlled manner. A typical modular-based system uses a modular pipeline, which has four modules that execute sequentially - Natural Language Understanding (NLU), Dialog State Tracking (DST), Policy Learning (POL), and Natural Language Generation (NLG). The DST module is essential for enabling the system to comprehend the user's requests by tracking them in the form of slots and values (belief states) at every turn. For instance, in a dialogue system that helps users book flights, the system might track slots such as destination, departure, travel date, and number of travelers. By keeping track of these slots and their values, the system can understand the user requirements and provides this information to the next module. For example, consider the user message - \textquote{\textit{Plan a train trip to Berlin this Friday for two people}} - the DST module is supposed to extract (\textit{slot, value}) pairs as follows: \{(\textit{destination, Berlin}), (\textit{day, Friday}), (\textit{people, 2})\}. In this thesis, the focus is on the DST module for extracting slots and values.
+\paragraph{} Task-oriented dialog systems, both modular and end-to-end, are capable of handling a wide range of tasks (such as ticket booking, restaurant booking, etc.) across various domains. A task-oriented dialogue system has stricter requirements for responses because it needs to accurately understand and process the user's message. Therefore, modular methods were suggested as a way to generate responses in a more controlled manner. The architecture of a typical modular-based task-oriented dialog system is depicted in Figure \ref{figure:1}. Such a system uses a pipeline of four modules that execute sequentially - Natural Language Understanding (NLU), Dialog State Tracking (DST), Policy Learning (PL), and Natural Language Generation (NLG). The NLU module extracts semantic values from the user message and performs intent detection and domain classification. The DST module takes the extracted values and fills the slot-value pairs based on the entire dialog history. The Policy Learning (PL) module takes the slot-value pairs and decides the next action to be performed by the dialog system. The NLG module converts the dialog actions received from the PL module into natural language text, which is usually the system response to the user. +\vspace{0.5cm} +\begin{figure}[h!] + \centering + \includegraphics[width=\linewidth]{images/modular_tod} + \caption{Modular-based task-oriented dialog system \citep{ni2021dlds}} + \label{figure:1} +\end{figure} + +The DST module is essential for enabling the system to comprehend the user's requests by tracking them in the form of slots and values (belief states) at every turn. In recent years, some dialog systems pass the user utterances directly to the DST module. This approach fills the slot-value pairs directly from the raw user message and eliminates the need for an NLU module. For example, consider the user message - \textquote{\textit{Plan a train trip to Berlin this Friday for two people}} - the DST module is supposed to fill (\textit{slot, value}) pairs as follows: \{(\textit{destination, Berlin}), (\textit{day, Friday}), (\textit{people, 2})\}. + +\paragraph{} A typical task-oriented dialog system can assist users across multiple domains (restaurant, hotel, train, booking). Each dialog domain contains an ontology, which represents the knowledge of the domain and the information required for specific tasks. The ontology of a domain consists of a pre-defined set of slots and all the possible values for each slot. Neural-based models were proposed to solve the DST task as multi-class classification, where the model predicts the correct value among the possible candidates. This approach depends on the ontology of the domains and needs to track a large number of slot-value pairs. An ontology is hard to obtain in real-world scenarios, especially for new domains, and neural-based DST models also need large amounts of training data, which is rarely available for new domains. + +\paragraph{} A dialog state or belief state in DST contains the information required for the system to process the user's request. At each turn of the dialogue, the dialog state can have \textit{informable slots} and \textit{requestable slots}. Informable slots capture the preferences and requirements that the user specifies for the system action. For example, in the restaurant domain, the user can specify a particular type of food or a desired price range when booking a table.
The belief state slots with such information are called \textit{informable slots}. Users can ask the dialog system for the address or phone number of a restaurant; such slots are known as \textit{requestable slots}. This thesis focuses on the DST module and on extracting informable slots and their values without depending on the ontology. \subsection{Pre-trained Language Models (PLMs)} -\paragraph{} Large pre-trained language models are trained on huge amounts of textual data and have achieved state-of-the-art performance in a variety of NLP tasks, such as machine translation, text classification, text generation, and summarization. These models are trained on large datasets and are able to learn the probability distribution of the words. Pre-trained language models based on transformer architectures \citep{vaswani2017attention}, such as BERT \citep{devlin2019bert} and GPT \citep{radford2018gpt}, have also achieved state-of-the-art performance on many NLP tasks. GPT-2 \citep{radford2019gpt2} is a transformer-based auto-regressive language model trained on large amounts of open web text data. GPT-2 is trained with a simple objective: predict the next word, given all previous words within some text. The architecture and training objective of the PLMs plays an important role in determining their applicability to particular prompting tasks \citep{liu2021ppp}. For example, left-to-right auto-regressive LMs predict the next word by assigning a probability to the sequence of words. For tasks that require the PLM to generate text from \textit{prefix} prompts (the entire prompt string followed by generated text), the left-to-right LMs tend to mesh well with the left-to-right nature of the language model. +\paragraph{} Large pre-trained language models are trained on huge amounts of textual data and have achieved state-of-the-art performance in a variety of NLP tasks, such as machine translation, text classification, text generation, and summarization. These PLMs, trained on large-scale datasets, encode significant linguistic knowledge in their large number of parameters. Pre-trained language models based on transformer architectures \citep{vaswani2017attention}, such as BERT \citep{devlin2019bert} and GPT \citep{radford2018gpt}, have also achieved state-of-the-art performance on many NLP tasks. GPT-2 \citep{radford2019gpt2} is a transformer-based left-to-right auto-regressive language model trained on large amounts of open web text data. The main training objective of GPT-2 is to predict the next word, given all the previous words. A left-to-right auto-regressive language model predicts the next word given all the previous words or, equivalently, assigns a probability to a sequence of words. For a sequence of words $x = x_1, x_2, \ldots, x_n$, the probability can be factorized using the chain rule from left to right: + +$$ +P(x) = P\left(x_1\right) \times P\left(x_2 \mid x_1\right) \times \ldots \times P\left(x_n \mid x_1 \cdots x_{n-1}\right) +$$ + + +\paragraph{} PLMs trained on large amounts of text can be fine-tuned with task-specific data to solve downstream tasks efficiently. Previous work by \citet{wu2020tod-bert} further pre-trained the BERT model on nine different task-oriented dialog datasets and then fine-tuned it on downstream tasks. This approach improved the performance on downstream tasks over fine-tuning BERT directly.
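To make the auto-regressive factorization above concrete, the following minimal sketch scores a short word sequence with GPT-2. It is an illustration only and not part of the thesis experiments; the Hugging Face \textit{transformers} library, the \texttt{gpt2} checkpoint, and the example sentence are assumed here purely for demonstration.

\begin{verbatim}
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Placeholder example sentence x = x_1, ..., x_n (split into BPE tokens).
text = "plan a train trip to berlin this friday"
input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

with torch.no_grad():
    logits = model(input_ids).logits              # shape: (1, n, vocab_size)

# The logits at position t-1 define P(x_t | x_1 ... x_{t-1}).
log_probs = torch.log_softmax(logits[0, :-1], dim=-1)
targets = input_ids[0, 1:]
token_log_probs = log_probs[torch.arange(targets.size(0)), targets]

# Chain rule: log P(x) is the sum of the conditional log-probabilities
# (the first token is treated as given, since no BOS token is added here).
print("log P(x) =", token_log_probs.sum().item())
\end{verbatim}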
\textsc{Soloist} \citep{peng2021soloist} used a similar approach, further pre-training the GPT-2 model on two task-oriented dialog corpora and fine-tuning the resulting model on the DST task. +The pre-trained \textsc{Soloist}, which uses the publicly available 12-layer GPT-2 (117M) model, is the baseline model of this thesis. The prompt-based methods in this thesis also utilize the pre-trained \textsc{Soloist} and fine-tune it on the prompt-based DST task. -\paragraph{} The baseline model of this thesis, \textsc{Soloist} \citep{peng2021soloist}, uses a 12-layer GPT-2 for building the task-oriented dialog system. \textsc{Soloist} uses the publicly available 117M-parameter GPT-2 as initialization for task-grounded pre-training. The prompt-based methods in this thesis utlize the pre-trained \textsc{Soloist} and fine-tune it to the downstream DST task. +\subsection{SOLOIST Model} \label{subsec:soloist} -\subsection{SOLOIST Model} +\paragraph{} \textsc{Soloist} \citep{peng2021soloist} uses the \textit{pre-train, fine-tune} paradigm for building a task-oriented dialog system with the auto-regressive language model GPT-2 \citep{radford2019gpt2}. The dialog system is built in two phases. In the pre-training phase, \textsc{Soloist} is initialized with GPT-2 and further trained on two large task-oriented datasets, Schema and Taskmaster. The primary goal at this stage is to learn task completion skills such as \textit{belief prediction} and \textit{response generation}. In the belief prediction task of the pre-training stage, the \textsc{Soloist} model takes the dialog history as input and generates belief states as a sequence of words. The generated belief state sequences take the form \textquote{\textit{belief: $slot_1 = value_1; slot_2 = value_2; \ldots$}}. The pre-training objective for predicting belief states is: -\paragraph{} \textsc{Soloist} \citep{peng2021soloist} is a task-oriented dialog system that uses transfer learning and machine teaching to build task bots at scale. \textsc{Soloist} uses the \textit{pre-train, fine-tune} paradigm for building end-to-end dialog systems using a transformer-based auto-regressive language model GPT-2 \citep{radford2019gpt2}, which subsumes different dialog modules (i.e., NLU, DST, POL, NLG) into a single model. In a \textit{pre-train, fine-tune} paradigm, a fixed \textit{pre-trained} LM is adapted to different downstream tasks by introducing additional parameters and \textit{fine-tuning} them using task-specific objective functions. In the pre-training stage, \textsc{Soloist} is initialized with the 12-layer GPT-2 (117M parameters) and further trained on large heterogeneous dialog corpora. The primary goal at this stage is to learn task completion skills such as belief state prediction (DST) and response generation. In the fine-tuning stage, the pre-trained \textsc{Soloist} model can be used to solve new tasks by just using a handful of task-specific dialogs. +$$ +\mathcal{L}=\log P(b \mid s)=\sum_{t=1}^{T_b} \log P\left(b_t \mid b_{<t}, s\right) +$$
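The following minimal sketch illustrates this objective; it is not the \textsc{Soloist} implementation. The \texttt{gpt2} checkpoint from the Hugging Face \textit{transformers} library stands in for the pre-trained \textsc{Soloist} model, and the dialog history, slot names, and values are hypothetical.

\begin{verbatim}
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# "gpt2" is a stand-in for the pre-trained Soloist checkpoint (12-layer GPT-2).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Dialog history s and target belief state b, flattened into the sequence
# form "belief: slot_1 = value_1; slot_2 = value_2; ..." described above.
history = "user: plan a train trip to berlin this friday for two people"
belief = "belief: destination = berlin; day = friday; people = 2"

encoding = tokenizer(history + " " + belief, return_tensors="pt")
labels = encoding["input_ids"].clone()

# Mask the history tokens with -100 (ignored by the loss) so that the
# negative log-likelihood covers only the belief tokens b_1 ... b_{T_b}
# (token-boundary handling is simplified for this sketch).
n_history = len(tokenizer(history)["input_ids"])
labels[:, :n_history] = -100

outputs = model(**encoding, labels=labels)
loss = outputs.loss    # mean of -log P(b_t | b_<t, s) over the belief tokens
loss.backward()        # a pre-training or fine-tuning step would update the model here
\end{verbatim}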