Added thesis proposal latex files (and latex .gitignore)

main
Pavan Mandava 3 years ago
parent 829f46590a
commit 041e521254

.gitignore

@@ -158,7 +158,12 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
workspace.xml
.idea/shelf/
/shelf/
out/
*.iws
# ---> JupyterNotebooks
# gitignore template for Jupyter Notebooks

@@ -0,0 +1,23 @@
# LaTeX temporary files
*.aux
*.log
*.toc
# PDF output - usually a bad idea to keep this in Git
*.pdf
# Latexmk
*.fdb_latexmk
# SyncTeX
*.synctex.gz
# LaTeX Beamer
*.snm
*.vrb
*.nav
*.out
# BibTeX
*.bbl
*.blg

Binary file not shown.


Binary file not shown.

@@ -0,0 +1,89 @@
\documentclass[a4paper, titlepage, 12pt]{article}
\usepackage[paper=a4paper,left=3cm, right=3cm]{geometry}
\usepackage[utf8]{inputenc}
\usepackage{color}
\usepackage[english]{babel}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{array}
\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue, allcolors=blue]{hyperref}
\usepackage{natbib}
\bibliographystyle{plainnat}
\bibpunct[; ]{(}{)}{;}{a}{,}{;}
\usepackage{graphicx}
\usepackage{makecell}
%%% SET DATA HERE
% Preliminary title of the thesis.
\newcommand{\thesisTitle}{Prompt-based methods for DST}
% Full Name & other details
\newcommand{\name}{Mandava, Sai Pavan}
\newcommand{\matrikNummer}{3461015}
\newcommand{\myEmail}{st169661@stud.uni-stuttgart.de}
% potential start time of the Thesis
\newcommand{\startTime}{July 2022}
% Thesis Supervisor
\newcommand{\supervisor}{Prof. Dr. Thang Vu}
\newcommand{\supervisorEmail}{thang.vu@ims.uni-stuttgart.de}
%% 2nd supervisor/advisor details here
%% TODO command
\newcommand\todo[1]{\colorbox{yellow}{#1}}
% Start document
\begin{document}
\begin{titlepage}
\begin{center}
\begin{figure}[h!]
\centering
\includegraphics[width=.3\linewidth]{images/ims_logo.jpeg}
\end{figure}
\large{Institut f{\"u}r Maschinelle Sprachverarbeitung\\Universit{\"a}t Stuttgart\\Pfaffenwaldring 5b\\70569 Stuttgart}\\[0.3cm]
\vspace{1cm}
\LARGE{Master Thesis Proposal}\\[0.7cm]
\Huge{\textbf{\thesisTitle}}\\ [0.5cm]
\large{\startTime} \\
\vspace{1cm}
\Large{\textbf{\name}} \\[3pt]
\large{M.Sc. Computational Linguistics} \\ [2pt]
\large{Mat.Nr.: \matrikNummer} \\ [2pt]
\normalsize{\myEmail}
\vspace{1cm}
\large{\textbf{Supervisor}}\\
\supervisor\\ [2pt]
\normalsize{\supervisorEmail}\\
\vspace{0.5cm}
\end{center}
\end{titlepage}
\input{sections/01_motivation}
\input{sections/02_related_work}
\input{sections/03_methods}
\input{sections/04_work_plan}
\clearpage
\bibliography{references}
\end{document}

@@ -0,0 +1,244 @@
@article{yang2022prompt,
author = {Yuting Yang and
Wenqiang Lei and
Juan Cao and
Jintao Li and
Tat{-}Seng Chua},
title = {Prompt Learning for Few-Shot Dialogue State Tracking},
journal = {CoRR},
volume = {abs/2201.05780},
year = {2022},
url = {https://arxiv.org/abs/2201.05780},
eprinttype = {arXiv},
eprint = {2201.05780},
}
@article{liu2021ppp,
author = {Pengfei Liu and
Weizhe Yuan and
Jinlan Fu and
Zhengbao Jiang and
Hiroaki Hayashi and
Graham Neubig},
title = {Pre-train, Prompt, and Predict: {A} Systematic Survey of Prompting
Methods in Natural Language Processing},
journal = {CoRR},
volume = {abs/2107.13586},
year = {2021},
url = {https://arxiv.org/abs/2107.13586},
eprinttype = {arXiv},
eprint = {2107.13586},
}
@article{brown2020gpt3,
author = {Tom B. Brown and
Benjamin Mann and
Nick Ryder and
Melanie Subbiah and
Jared Kaplan and
Prafulla Dhariwal and
Arvind Neelakantan and
Pranav Shyam and
Girish Sastry and
Amanda Askell and
Sandhini Agarwal and
Ariel Herbert{-}Voss and
Gretchen Krueger and
Tom Henighan and
Rewon Child and
Aditya Ramesh and
Daniel M. Ziegler and
Jeffrey Wu and
Clemens Winter and
Christopher Hesse and
Mark Chen and
Eric Sigler and
Mateusz Litwin and
Scott Gray and
Benjamin Chess and
Jack Clark and
Christopher Berner and
Sam McCandlish and
Alec Radford and
Ilya Sutskever and
Dario Amodei},
title = {Language Models are Few-Shot Learners},
journal = {CoRR},
volume = {abs/2005.14165},
year = {2020},
url = {https://arxiv.org/abs/2005.14165},
eprinttype = {arXiv},
eprint = {2005.14165},
}
@article{madotto2021fsb,
author = {Andrea Madotto and
Zhaojiang Lin and
Genta Indra Winata and
Pascale Fung},
title = {Few-Shot Bot: Prompt-Based Learning for Dialogue Systems},
journal = {CoRR},
volume = {abs/2110.08118},
year = {2021},
url = {https://arxiv.org/abs/2110.08118},
eprinttype = {arXiv},
eprint = {2110.08118},
}
@article{radford2018gpt,
title={Improving language understanding by generative pre-training},
author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya and others},
year={2018},
publisher={OpenAI}
}
@article{radford2019gpt2,
title={Language models are unsupervised multitask learners},
author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
journal={OpenAI blog},
volume={1},
number={8},
pages={9},
year={2019}
}
@inproceedings{devlin2019bert,
title = "{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding",
author = "Devlin, Jacob and
Chang, Ming-Wei and
Lee, Kenton and
Toutanova, Kristina",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1423",
doi = "10.18653/v1/N19-1423",
pages = "4171--4186",
}
@article{peng2021soloist,
title = "{SOLOIST:} Building Task Bots at Scale with Transfer Learning and Machine Teaching",
author = "Peng, Baolin and
Li, Chunyuan and
Li, Jinchao and
Shayandeh, Shahin and
Liden, Lars and
Gao, Jianfeng",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.49",
doi = "10.1162/tacl_a_00399",
pages = "807--824",
}
@article{lee2021sdp,
author = {Chia{-}Hsuan Lee and
Hao Cheng and
Mari Ostendorf},
title = {Dialogue State Tracking with a Language Model using Schema-Driven
Prompting},
journal = {CoRR},
volume = {abs/2109.07506},
year = {2021},
url = {https://arxiv.org/abs/2109.07506},
eprinttype = {arXiv},
eprint = {2109.07506},
}
@article{eric2019multiwoz,
author = {Mihail Eric and
Rahul Goel and
Shachi Paul and
Abhishek Sethi and
Sanchit Agarwal and
Shuyang Gao and
Dilek Hakkani{-}T{\"{u}}r},
title = {MultiWOZ 2.1: Multi-Domain Dialogue State Corrections and State Tracking
Baselines},
journal = {CoRR},
volume = {abs/1907.01669},
year = {2019},
url = {http://arxiv.org/abs/1907.01669},
eprinttype = {arXiv},
eprint = {1907.01669},
}
@inproceedings{budzianowski2018multiwoz,
title = "{M}ulti{WOZ} - A Large-Scale Multi-Domain {W}izard-of-{O}z Dataset for Task-Oriented Dialogue Modelling",
author = "Budzianowski, Pawe{\l} and
Wen, Tsung-Hsien and
Tseng, Bo-Hsiang and
Casanueva, I{\~n}igo and
Ultes, Stefan and
Ramadan, Osman and
Ga{\v{s}}i{\'c}, Milica",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D18-1547",
doi = "10.18653/v1/D18-1547",
pages = "5016--5026",
}
@inproceedings{min2020dsi,
title = {Dialogue State Induction Using Neural Latent Variable Models},
author = {Min, Qingkai and Qin, Libo and Teng, Zhiyang and Liu, Xiao and Zhang, Yue},
booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on
Artificial Intelligence, {IJCAI-20}},
publisher = {International Joint Conferences on Artificial Intelligence Organization},
editor = {Christian Bessiere},
pages = {3845--3852},
year = {2020},
month = {7},
note = {Main track},
doi = {10.24963/ijcai.2020/532},
url = {https://doi.org/10.24963/ijcai.2020/532},
}
@inproceedings{gao2021lmbff,
title = "Making Pre-trained Language Models Better Few-shot Learners",
author = "Gao, Tianyu and
Fisch, Adam and
Chen, Danqi",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-long.295",
doi = "10.18653/v1/2021.acl-long.295",
pages = "3816--3830",
}
@inproceedings{cui2021template,
title = "Template-Based Named Entity Recognition Using {BART}",
author = "Cui, Leyang and
Wu, Yu and
Liu, Jian and
Yang, Sen and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-acl.161",
doi = "10.18653/v1/2021.findings-acl.161",
pages = "1835--1845",
}

@@ -0,0 +1,7 @@
\section{Motivation}
\section{Motivation}
\paragraph{} Dialog State Tracking (DST) is an essential module in dialog systems: it tracks the user's goals in the form of dialog states, given the entire dialog history. In dialog systems, ``dialog states'' - sometimes also called ``belief states'' - contain a set of \textit{(slot, value)} pairs for each turn of the dialog history. Existing data-driven methods and neural models for individual dialog modules (NLU, DST, NLG) and for end-to-end dialog systems show promising results, but they need large amounts of task-specific training data, which is rarely available for new tasks. For task-specific DST, collecting dialog state labels can also be costly, as it requires domain experts to annotate all possible \textit{(slot, value)} pairs for each turn of the dialogues. A typical task-oriented dialog system contains an ontology for each domain, with a pre-defined set of slots and all possible values per domain. In real-world applications, defining all possible slots and values for DST is difficult, because new domains keep emerging and user needs evolve continuously.
\paragraph{} Prompt-based learning \textit{(``pre-train, prompt, and predict'')} is a new paradigm in NLP which aims to predict the probability of text directly with a pre-trained LM \citep{liu2021ppp}. This framework is powerful because it allows the language model to be \textit{pre-trained} on massive amounts of raw text, and, by defining a new prompting function, the model can perform \textit{few-shot} or even \textit{zero-shot} learning. Large pre-trained language models (PLMs) are expected to be useful in \textit{few-shot} scenarios where task-related training data is limited, as they can be ``probed'' efficiently for task-related knowledge by using a prompt. One example of such a large pre-trained language model is GPT-3 \citep{brown2020gpt3} - \textit{``Language Models are Few-Shot Learners''}. \citet{madotto2021fsb} built an end-to-end chatbot (Few-Shot Bot) using \textit{prompt-based few-shot learning} (no gradient fine-tuning) and achieved results comparable to the state of the art. Prompt-based learning for few-shot DST in domains with limited labeled data is still under-explored.
\paragraph{} Recently, \citet{yang2022prompt} proposed a new prompt learning framework for few-shot DST. Their work designs a \textit{value-based prompt} and an \textit{inverse prompt} mechanism to efficiently train a DST model for domains with limited training data. The approach does not depend on a slot ontology, and the results show that it can generate unseen slots and outperforms existing state-of-the-art few-shot methods. The goal of this thesis is to further explore this prompt-based few-shot learning framework for DST through three tasks: (1) prompt learning framework for few-shot DST - reproduce the results of \citet{yang2022prompt}; can DST knowledge be probed from a PLM? (2) Evaluation and analyses of belief state predictions - this task will establish which improvements can be observed with prompt-based methods and what the drawbacks of this approach are. (3) Extend the prompt-based DST framework with various \textit{multi-prompt} learning methods; can different \textit{multi-prompt} techniques help the PLM better understand the DST task? These research methods are formally described in the later sections of this proposal.

@@ -0,0 +1,29 @@
\section{Background \& Related Work}
\subsection{Dialog State Tracking (DST)}
\paragraph{} Task-oriented dialog systems, both modular and end-to-end, solve a wide range of tasks (ticket booking, restaurant booking, etc.) across different domains. Because task-oriented dialog systems must satisfy strict response constraints in order to handle user messages accurately, modular systems were proposed to generate responses in a controllable way. A typical modular system uses a pipeline of four modules that execute sequentially: Natural Language Understanding (NLU), Dialog State Tracking (DST), Policy Learning (POL), and Natural Language Generation (NLG). In this thesis, the focus is on the DST module. The dialog state tracker infers the belief states (or user goals) from every turn of the dialog history and passes this information to the next module. For example, given the user message \textit{``Plan a train trip to Berlin this Friday''}, the DST module is expected to extract the belief state \textit{(slot, value)} pairs \{(\textit{destination, Berlin}), (\textit{day, this Friday})\}.
\subsection{Pre-trained Language Models (PLMs)}
\paragraph{} Large pre-trained language models are trained on huge amounts of textual data and are used to solve a variety of NLP tasks. Pre-trained transformer-based language models such as BERT \citep{devlin2019bert} and GPT \citep{radford2018gpt} have achieved state-of-the-art performance on many tasks. GPT-2 \citep{radford2019gpt2} is a state-of-the-art auto-regressive language model trained on large amounts of open web text. GPT-2 is trained with a simple objective: predict the next word, given all previous words in a text. The training objective of a pre-trained LM plays an important role in determining its applicability to particular prompting tasks \citep{liu2021ppp}. For example, left-to-right auto-regressive LMs may be particularly suitable for \textit{prefix} prompts.
\paragraph{} The baseline model of this thesis, \textsc{Soloist} \citep{peng2021soloist}, uses a 12-layer GPT-2. \textsc{Soloist} uses the publicly available 117M-parameter GPT-2 as the initialization for task-grounded pre-training. The prompt learning model of this thesis will build on \textsc{Soloist} to learn the DST task.
\subsection{SOLOIST}
\paragraph{} \textsc{Soloist} \citep{peng2021soloist} is a task-oriented dialog system that uses transfer learning and machine teaching to build task bots at scale. \textsc{Soloist} follows the \textit{pre-train, fine-tune} paradigm for building end-to-end dialog systems with a transformer-based auto-regressive language model, GPT-2 \citep{radford2019gpt2}, which subsumes the different dialog modules (i.e., NLU, DST, POL, NLG) into a single model. In the \textit{pre-train, fine-tune} paradigm, a fixed \textit{pre-trained} LM is adapted to different downstream tasks by introducing additional parameters and \textit{fine-tuning} them using task-specific objective functions. In the pre-training stage, \textsc{Soloist} is initialized with the 12-layer GPT-2 (117M parameters) and further trained on large heterogeneous dialog corpora. The primary goal at this stage is to learn task completion skills such as DST and POL. Belief state prediction (DST) is one of the tasks in the task-grounded pre-training and will be utilized in this thesis. In the fine-tuning stage, the pre-trained \textsc{Soloist} model can be adapted to new tasks using just a handful of task-specific dialogs.
\paragraph{} In this thesis, the pre-trained \textsc{Soloist} will serve as the baseline model. In the fine-tuning stage, it will be trained on a multi-domain task-oriented dialog dataset to solve the belief state prediction task. The predictions and results from this baseline can then be compared with those of the prompt-based model for detailed analyses.
\subsection{\textit{Pre-train, Prompt, and Predict (PPP)} Paradigm}
\paragraph{} Prompt-based learning (also dubbed \textit{``pre-train, prompt, and predict''}) is a new paradigm that aims to utilize PLMs more efficiently to solve downstream NLP tasks \citep{liu2021ppp}. In this paradigm, instead of adapting pre-trained LMs to downstream tasks via objective engineering, downstream tasks are reformulated to look more like those solved during the original LM training with the help of a textual \textit{prompt}. To perform prediction tasks, the original input $x$ is modified using a \textit{template} into a textual \textit{prompt} $x^{\prime}$ that has some unfilled slots, and then the PLM is used to probabilistically fill the unfilled information to obtain a final string $\hat{x}$, from which the final output $y$ can be derived.
\paragraph{} For example, to recognize the emotion in a text with \textit{input} $x = $ ``I missed the bus today.'', the \textit{template} may take a form such as ``$[X]$ I felt so $[Z]$''. The \textit{prompt} $x^{\prime}$ then becomes ``I missed the bus today. I felt so $[Z]$'', and the PLM is asked to fill the slot $[Z]$ with an emotion-bearing word. There are two main varieties of prompts: \textit{cloze prompts}, where the slot $[Z]$ is filled in the middle of the text, and \textit{prefix prompts}, where the input text comes entirely before $[Z]$. In general, for tasks that are solved with a standard auto-regressive LM, prefix prompts tend to be more helpful, as they mesh well with the left-to-right nature of the model.
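As an illustration only (the template and the helper name are hypothetical, not taken from \citet{liu2021ppp}), such a prompting function can be sketched as plain string substitution:
\begin{verbatim}
# Minimal sketch of a prompting function: input x -> prompt x'.
# The PLM call itself is omitted; any masked or causal LM could
# be asked to fill the answer slot [Z].
TEMPLATE = "[X] I felt so [Z]"

def f_prompt(x: str) -> str:
    """Fill the input slot [X]; [Z] is left for the PLM."""
    return TEMPLATE.replace("[X]", x)

print(f_prompt("I missed the bus today."))
# -> I missed the bus today. I felt so [Z]
\end{verbatim}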
\paragraph{} In this way, by selecting appropriate prompts, the pre-trained LM can be used to predict the desired output, sometimes even without any additional task-specific training. In this thesis, prompt-based methods will be utilized to train the PLM and help it understand the DST task.
\subsection{Prompt learning for DST}
\paragraph{} Existing work by \citet{lee2021sdp} uses slots as prompts, along with natural language descriptions of the schema, to generate the corresponding values. This slot-based prompt DST approach uses an encoder-decoder LM with a bi-directional encoder. The method relies on a known slot ontology and requires a lot of training data for fine-tuning the PLM. In real-world applications, defining all possible slots is difficult because new domains keep emerging and user needs evolve continuously. \citet{yang2022prompt} proposed a new prompt learning framework that uses values as prompts and does not rely on a slot ontology. This thesis will apply the value-based prompt approach to few-shot DST.

@@ -0,0 +1,82 @@
\section{Methods}
The main goal of this thesis is to explore the prompt learning framework for few-shot DST designed by \citet{yang2022prompt} and to propose some improvements. The thesis work can be subdivided into three tasks: (1) apply the prompt learning framework for few-shot DST, (2) evaluate and analyze the belief-state predictions, and (3) explore multi-prompt learning methods.
\subsection{Prompt learning framework for few-shot DST} \label{task1}
\paragraph{} This task aims to reproduce the results of \citet{yang2022prompt} and to apply some minor improvements by utilizing multi-prompt methods. There is no publicly available implementation of this prompt learning framework, so this task implements it on top of the \textsc{Soloist} baseline.
\paragraph{Dataset} The baseline and the prompt-based methods are evaluated on the MultiWOZ 2.1 dataset \citep{eric2019multiwoz}. MultiWOZ 2.0, originally released by \citet{budzianowski2018multiwoz}, is a fully-labeled collection of human-human written conversations spanning multiple domains and topics. \citet{eric2019multiwoz} fixed and improved the dialogue utterances and released MultiWOZ 2.1, which contains 8438/1000/1000 dialogues for training/validation/testing, respectively. \citet{yang2022prompt} excluded two domains that only appear in the training set. Under few-shot settings, only a portion of the training data will be utilized in order to observe performance in a low-resource scenario.
\paragraph{SOLOIST Baseline} \textsc{Soloist} \citep{peng2021soloist} is the baseline for the prompt-based approach. It is initialized with the 12-layer GPT-2 \citep{radford2019gpt2} and further trained on two task-oriented dialog corpora (Schema and Taskmaster). The task-grounded pre-training enables \textsc{Soloist} to solve two dialog-related tasks: \textit{belief state prediction} and \textit{response generation}. For the baseline implementation, the pre-trained \textsc{Soloist} will be fine-tuned on the MultiWOZ 2.1 dataset and will perform the belief state prediction task for DST. The main focus of this thesis is on prompt-based methods; however, the \textsc{Soloist} baseline implementation is required for comparing the belief state predictions and the performance of prompt learning.
\paragraph{Value-based Prompt} A general idea for generating (\textit{slot, value}) pairs is to use slots in the prompts and generate the corresponding values \citep{lee2021sdp}. For example, given the utterance ``\textit{Plan a trip to Berlin}'' and the slot (\textit{destination}), the prompt to the PLM could become ``\textit{Plan a trip to Berlin. destination = [z]}'', and the PLM is expected to generate \textit{[z]} as ``\textit{Berlin}''. However, this approach relies on a slot ontology, and the fixed set of slots can change in real-world applications. \citet{yang2022prompt} proposed a \textit{value-based prompt} that uses values in the prompt and generates the corresponding slots. This method does not require any pre-defined set of slots and can also generate unseen slots. Consider the prompt template ``\textit{belief states: value = [v], slot = [s]}''; the prompt function $f$ can take the form $f(v) = $ \textit{[utterances] belief states: value = [v], slot = [s]}. Given the value candidate $v = $ ``\textit{Berlin}'', the PLM should generate the slot \textit{[s] = ``destination''}. The overall training objective of value-based prompt generation is to maximize the log-likelihood of the slots in the training dataset $D$:
$$\mathcal{L}=\sum_{t}^{|D|} \log P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)$$
where $P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)$ is the probability of slot $s_{t}$ given the dialog history $c_{t}$ and the prompt function $f$ filled with value $v_{t}$, for each turn $t$.
The loss $\mathcal{L}$ from this step will be combined with the loss from the next step to compute the final loss. During training, the values from the annotated training dataset are used to construct the prompts.
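A minimal sketch of this prompt construction (the helper name is illustrative; no official implementation of \citet{yang2022prompt} is available):
\begin{verbatim}
# Sketch of the value-based prompt f(v); the PLM continues the
# string with the slot name, so training maximizes
# log P(s_t | c_t, f(v_t)) for the gold slot.
def value_prompt(history: str, value: str) -> str:
    return f"{history} belief states: value = {value}, slot ="

print(value_prompt("Plan a trip to Berlin.", "Berlin"))
# -> Plan a trip to Berlin. belief states: value = Berlin, slot =
# Gold continuation here: "destination".
\end{verbatim}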
\paragraph{Inverse Prompt} The \textit{inverse prompt} mechanism generates values by prompting with slots. After a slot $s$ has been generated via the value-based prompt (previous step), it is presented to the inverse prompt function $I$. The inverse prompt aims to generate a value $v^{\prime}$ that should be close to the original value $v$. The template for the inverse prompt function is $I = $ ``\textit{belief states: slot = [s], value = [v]}''. The inverse prompt can be considered an auxiliary task of this prompt-based approach; it can improve performance by helping the PLM understand the task and by tuning the slot generation process. The loss function $\tilde{\mathcal{L}}$ for the inverse prompt mechanism is:
$$\tilde{\mathcal{L}}=\sum_{t}^{|D|} \log P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)$$
The final loss $\mathcal{L}^{*}$ is computed by combining the value-based prompt loss $\mathcal{L}$ and the inverse prompt loss $\tilde{\mathcal{L}}$:
$$ \mathcal{L}^{*} = \mathcal{L} + w *\tilde{\mathcal{L}} $$
where $w \in (0,1)$ is a weight that adjusts the influence of the inverse prompt.
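The combined objective can be sketched as follows, assuming a HuggingFace GPT-2 checkpoint as a stand-in for the pre-trained \textsc{Soloist} model (all names and prompt strings are illustrative):
\begin{verbatim}
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def lm_loss(prompt: str, target: str) -> torch.Tensor:
    """Negative log-likelihood of the target tokens given the prompt."""
    p = tok(prompt, return_tensors="pt").input_ids
    t = tok(" " + target, return_tensors="pt").input_ids
    ids = torch.cat([p, t], dim=1)
    labels = ids.clone()
    labels[:, : p.size(1)] = -100      # prompt positions carry no loss
    return model(input_ids=ids, labels=labels).loss

c = "Plan a trip to Berlin."
loss = lm_loss(c + " belief states: value = Berlin, slot =",
               "destination")                  # L (value-based prompt)
loss_inv = lm_loss(c + " belief states: slot = destination, value =",
                   "Berlin")                   # L~ (inverse prompt)
w = 0.5                                        # w in (0, 1)
total = loss + w * loss_inv                    # L* = L + w * L~
\end{verbatim}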
\paragraph{Training} For training the above prompt-based approach, the pre-trained \textsc{Soloist} model (117M parameters) will be fine-tuned on the prompt-based slot generation process. For fine-tuning, the MultiWOZ 2.1 dataset is used; the dialog history and the annotated values are inserted directly into the prompts. The inverse prompt is only used during the training phase. To evaluate the prompt-based model's ability to generate slots under low-resource settings, few-shot experiments will be performed during training: experiments will be conducted on random samples of the training data (1\%, 5\%, 10\%, and 25\%) for each domain. Few-shot experiments will be performed on both the \textsc{Soloist} baseline and the prompt-based model.
\paragraph{Testing} In the testing phase, only value-based prompts are used for slot generation; candidate values are not known at test time. Following existing work \citep{min2020dsi}, candidate values can be extracted directly from the utterance: first, POS tags, named entities, and co-references are extracted; then a set of rules is applied to the POS and entity patterns to extract candidate values, e.g., keeping adjectives and adverbs and filtering stop words and repeated candidates.
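A minimal sketch of this rule-based value extraction, assuming spaCy for POS tagging and NER (co-reference resolution is omitted and the filtering rules are illustrative):
\begin{verbatim}
import spacy

nlp = spacy.load("en_core_web_sm")

def candidate_values(utterance: str) -> list[str]:
    """Candidate values: named entities plus non-stopword
    adjectives/adverbs, with duplicates filtered."""
    doc = nlp(utterance)
    cands = [ent.text for ent in doc.ents]
    cands += [t.text for t in doc
              if t.pos_ in ("ADJ", "ADV") and not t.is_stop]
    seen, out = set(), []
    for c in cands:               # de-duplicate, keep first mention
        if c.lower() not in seen:
            seen.add(c.lower())
            out.append(c)
    return out

print(candidate_values("I want a cheap hotel in Berlin on Friday."))
# e.g. ['Berlin', 'Friday', 'cheap']
\end{verbatim}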
\subsection{Evaluation \& Analyses} \label{task2}
\paragraph{Evaluation Metrics} The standard metric joint goal accuracy (JGA) will be adopted to evaluate the belief state predictions. This metric compares all predicted belief states to the ground-truth states for each turn: a prediction is correct only if all predicted states match the ground-truth states, i.e., both slots and values must match. To factor out the influence of value extraction, \citet{yang2022prompt} proposed JGA*, where accuracy is computed only over the belief states whose values were correctly identified. These evaluation metrics can answer the following questions. \textbf{Q1}: How do the prompt-based methods perform overall compared to the SoTA \textsc{Soloist}? \textbf{Q2}: Can the prompt-based model perform better under few-shot settings? \textbf{Q3}: Does JGA* yield a better score than JGA?
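A minimal sketch of the JGA computation (the data layout is illustrative):
\begin{verbatim}
def joint_goal_accuracy(pred_states, gold_states):
    """A turn counts as correct only if the full predicted set of
    (slot, value) pairs equals the gold set for that turn."""
    correct = sum(set(p) == set(g)
                  for p, g in zip(pred_states, gold_states))
    return correct / len(gold_states)

preds = [[("destination", "berlin"), ("day", "friday")],
         [("destination", "berlin")]]
golds = [[("destination", "berlin"), ("day", "friday")],
         [("destination", "london")]]
print(joint_goal_accuracy(preds, golds))  # 0.5
# JGA* restricts the computation to states whose values were
# correctly identified by the extraction step.
\end{verbatim}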
\paragraph{Analyses of belief state predictions} The main goal of this task is to analyze the belief state predictions. The predictions of the \textsc{Soloist} baseline and of the prompt-based methods will be compared and analyzed to identify improvements and drawbacks. A detailed error analysis will be performed on the wrong belief state predictions.
\subsection{Multi-prompt learning methods} \label{task3}
The \textit{value-based} prompt described in the previous sections utilizes a \textit{single} prompt for making predictions. However, a significant body of research has demonstrated that using multiple prompts can further improve the efficacy of prompting methods \citep{liu2021ppp}. There are different ways to extend single-prompt learning to multiple prompts. This task will explore three multi-prompt learning methods: \textit{prompt ensembling}, \textit{prompt augmentation}, and \textit{prompt decomposition}. It aims to answer the following questions. \textbf{Q1}: Can combining different \textit{multi-prompt} techniques help the PLM better understand the DST task? \textbf{Q2}: How do various hand-crafted prompt functions influence the prompt-based model?
\paragraph{Prompt Ensembling} This method uses multiple \textit{unanswered} prompts at inference time to make predictions \citep{liu2021ppp}. The idea leverages the complementary advantages of different prompts and stabilizes performance on downstream tasks. \citet{yang2022prompt} applied prompt ensembling to the value-based prompt to effectively utilize four different prompts. A simple way to ensemble is to train a separate model for each prompt and to generate the output by weighted averaging of the slot generation probabilities. The probability of slot $s_t$ can be calculated via:
$$
P\left(s_{t} \mid c_{t}\right)=\sum_{k}^{|K|} \alpha_{k} * P\left(s_{t} \mid c_{t}, f_{k}\left(v_{t}\right)\right)
$$
where $|K|$ is the number of prompt functions, $f_{k}$ is the $k$-th prompt function, and $\alpha_{k}$ is the weight of prompt $k$. This task will utilize prompt ensembling differently from \citet{yang2022prompt} by combining it with other multi-prompt methods. Experiments will be performed on various prompt templates (Table \ref{table:1}) to find the most effective and suitable prompts in combination with the other multi-prompt methods; a sketch of the ensembling step follows the table.
\begin{table}[h!]
\centering
\begin{tabular}{ c l }
$f_{1}$ & belief states: value = [v], slot = [s]\\
$f_{2}$ & belief states: [v] = [s]\\
$f_{3}$ & [v] is of slot type [s]\\
$f_{4}$ & [v] is the value of [s]\\
\vdots &
\end{tabular}
\caption{Examples of different prompt functions for ensembling}
\label{table:1}
\end{table}
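A minimal sketch of the weighted averaging defined above (the probabilities and weights are illustrative):
\begin{verbatim}
def ensemble_slot_probs(per_prompt_probs, alphas):
    """P(s|c) = sum_k alpha_k * P(s | c, f_k(v)), averaged over
    the |K| prompt functions."""
    slots = set().union(*per_prompt_probs)
    return {s: sum(a * p.get(s, 0.0)
                   for a, p in zip(alphas, per_prompt_probs))
            for s in slots}

# Two prompt functions scoring slots for the value "Berlin":
p1 = {"destination": 0.7, "departure": 0.3}   # from f_1
p2 = {"destination": 0.6, "departure": 0.4}   # from f_2
print(ensemble_slot_probs([p1, p2], alphas=[0.5, 0.5]))
# {'destination': 0.65, 'departure': 0.35}
\end{verbatim}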
\paragraph{Prompt Augmentation} \textit{Prompt augmentation}, sometimes called \textit{demonstration learning} \citep{gao2021lmbff}, provides a few additional \textit{answered prompts} that demonstrate to the PLM how the actual prompt slot can be answered. The answered prompts will be hand-picked from the training data, and experiments will be conducted with different sets of samples. Table \ref{table:2} provides an example of prompt augmentation; a sketch of the prompt construction follows the table.
\begin{table}[h!]
\centering
\begin{tabular}{ r l }
I want to book a cheap hotel. & \textit{cheap} is of slot \textit{price range}\\
Plan a train trip to Berlin. & \textit{Berlin} is of slot \textit{destination}\\
Find me an Italian Restaurant. & \textit{Italian} is of slot \textit{food}\\
Recommend a movie at Cinemaxx. & \textit{Cinemaxx} is of slot [s]
\end{tabular}
\caption{Examples of prompt augmentation with answered prompts}
\label{table:2}
\end{table}
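A minimal sketch of building an augmented prompt from the answered prompts in Table \ref{table:2} (the template wording is illustrative):
\begin{verbatim}
DEMONSTRATIONS = [
    ("I want to book a cheap hotel.", "cheap", "price range"),
    ("Plan a train trip to Berlin.", "Berlin", "destination"),
    ("Find me an Italian Restaurant.", "Italian", "food"),
]

def augmented_prompt(utterance: str, value: str) -> str:
    """Prefix the actual (unanswered) prompt with answered
    demonstrations so the PLM can infer the task format."""
    demos = "\n".join(f"{u} {v} is of slot {s}"
                      for u, v, s in DEMONSTRATIONS)
    return f"{demos}\n{utterance} {value} is of slot"

print(augmented_prompt("Recommend a movie at Cinemaxx.", "Cinemaxx"))
\end{verbatim}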
\paragraph{Prompt Decomposition} For utterances in which multiple slot values must be predicted, directly using a single prompt to generate multiple slots is challenging. One intuitive method is to break the prompt down into sub-prompts and to generate the slots for each sub-prompt separately: for each candidate value in the utterance, a \textit{value-based} prompt is constructed and the slot is generated. This approach will be utilized in both the training and the testing phase. This sort of \textit{prompt decomposition} has been explored by \citet{cui2021template} for the named entity recognition (NER) task. Table \ref{table:3} shows an example; a sketch of the decomposition step follows the table.
\begin{table}[h!]
\centering
\begin{tabular}{ c l }
Utterance: & Book a flight to Stuttgart tomorrow evening.\\
Prompt 1: & belief states: \textit{Stuttgart} = [s]\\
Prompt 2: & belief states: \textit{tomorrow} = [s]\\
Prompt 3: & belief states: \textit{evening} = [s]\\
\end{tabular}
\caption{Prompt decomposition example}
\label{table:3}
\end{table}
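A minimal sketch of the decomposition step, producing one value-based sub-prompt per candidate value (illustrative names):
\begin{verbatim}
def decomposed_prompts(utterance: str, values):
    """One sub-prompt per candidate value; each is answered by the
    PLM separately and the generated slots are merged afterwards."""
    return [f"{utterance} belief states: {v} =" for v in values]

for p in decomposed_prompts(
        "Book a flight to Stuttgart tomorrow evening.",
        ["Stuttgart", "tomorrow", "evening"]):
    print(p)
\end{verbatim}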

@@ -0,0 +1,37 @@
\section{Work Plan}
This section outlines the work plan for the thesis. The main tasks, sub-tasks, and deadlines are provided in Table \ref{table:4} below.
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{12pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.9} % Default value: 1
\begin{tabular}{ |c|c|c| }
\hline
\textbf{Main Tasks} & \textbf{Sub-tasks} & \textbf{Deadlines} \\
\hline
\makecell{Baseline Implementation\\ (Section \ref{task1})} & \makecell{Environment Setup\\ \textsc{Soloist} Baseline} & \makecell{22.08.2022 \\(3 weeks)} \\
\hline
\makecell{Prompt Learning\\ (Section \ref{task1})} & \makecell{Value-based Prompt\\ Inverse Prompt Mechanism \\Training \& Testing} & \makecell{19.09.2022 \\(4 weeks)} \\
\hline
\makecell{Evaluation \& Analyses\\ (Section \ref{task2})} & \makecell{Evaluation Metrics\\ Graphs \& Visualizations\\ Prediction Analysis \\Error Analysis} & \makecell{10.10.2022 \\(3 weeks)} \\
\hline
\makecell{Multi-prompt Learning\\ (Section \ref{task3})} & \makecell{Prompt Ensembling\\ Prompt Augmentation\\ Prompt Decomposition} & \makecell{07.11.2022 \\(4 weeks)} \\
\hline
\makecell{Thesis Writing \\(First draft)} & \LaTeX\ Typesetting & \makecell{19.12.2022 \\(6 weeks)} \\
\hline
\makecell{Thesis Writing \\(Final submission)} & \makecell{\LaTeX\ Typesetting \\Corrections \& Fixes} & \makecell{16.01.2023 \\(4 weeks)} \\
\hline
\end{tabular}
\endgroup
\caption{Thesis work plan with deadlines}
\label{table:4}
\end{table}
\textbf{Other important dates:}
\begin{itemize}
\item Thesis Start Date: \textbf{XX.08.2022}
\item Proposal Submission: \textbf{XX.07.2022}
\item Thesis End Date: \textbf{XX.01.2023}
\end{itemize}