@article{yang2022prompt,
  author     = {Yuting Yang and Wenqiang Lei and Juan Cao and Jintao Li and Tat{-}Seng Chua},
  title      = {Prompt Learning for Few-Shot Dialogue State Tracking},
  journal    = {CoRR},
  volume     = {abs/2201.05780},
  year       = {2022},
  url        = {https://arxiv.org/abs/2201.05780},
  eprinttype = {arXiv},
  eprint     = {2201.05780}
}

@article{liu2021ppp,
  author     = {Pengfei Liu and Weizhe Yuan and Jinlan Fu and Zhengbao Jiang and Hiroaki Hayashi and Graham Neubig},
  title      = {Pre-train, Prompt, and Predict: {A} Systematic Survey of Prompting Methods in Natural Language Processing},
  journal    = {CoRR},
  volume     = {abs/2107.13586},
  year       = {2021},
  url        = {https://arxiv.org/abs/2107.13586},
  eprinttype = {arXiv},
  eprint     = {2107.13586}
}

@inproceedings{brown2020gpt3,
  author    = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title     = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
  volume    = {33},
  pages     = {1877--1901},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
  year      = {2020}
}

@article{madotto2021fsb,
  author     = {Andrea Madotto and Zhaojiang Lin and Genta Indra Winata and Pascale Fung},
  title      = {Few-Shot Bot: Prompt-Based Learning for Dialogue Systems},
  journal    = {CoRR},
  volume     = {abs/2110.08118},
  year       = {2021},
  url        = {https://arxiv.org/abs/2110.08118},
  eprinttype = {arXiv},
  eprint     = {2110.08118}
}

@article{radford2018gpt,
  title     = {Improving language understanding by generative pre-training},
  author    = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya and others},
  year      = {2018},
  publisher = {OpenAI}
}

@article{radford2019gpt2,
  title   = {Language models are unsupervised multitask learners},
  author  = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
  journal = {OpenAI blog},
  volume  = {1},
  number  = {8},
  pages   = {9},
  year    = {2019}
}

@inproceedings{devlin2019bert,
  title     = "{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding",
  author    = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina",
  booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
  month     = jun,
  year      = "2019",
  address   = "Minneapolis, Minnesota",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/N19-1423",
  doi       = "10.18653/v1/N19-1423",
  pages     = "4171--4186"
}

@article{peng2021soloist,
  title     = "{SOLOIST:} Building Task Bots at Scale with Transfer Learning and Machine Teaching",
  author    = "Peng, Baolin and Li, Chunyuan and Li, Jinchao and Shayandeh, Shahin and Liden, Lars and Gao, Jianfeng",
  journal   = "Transactions of the Association for Computational Linguistics",
  volume    = "9",
  year      = "2021",
  address   = "Cambridge, MA",
  publisher = "MIT Press",
  url       = "https://aclanthology.org/2021.tacl-1.49",
  doi       = "10.1162/tacl_a_00399",
  pages     = "807--824"
}

@article{lee2021sdp,
  author     = {Chia{-}Hsuan Lee and Hao Cheng and Mari Ostendorf},
  title      = {Dialogue State Tracking with a Language Model using Schema-Driven Prompting},
  journal    = {CoRR},
  volume     = {abs/2109.07506},
  year       = {2021},
  url        = {https://arxiv.org/abs/2109.07506},
  eprinttype = {arXiv},
  eprint     = {2109.07506}
}

@article{eric2019multiwoz,
  author     = {Mihail Eric and Rahul Goel and Shachi Paul and Abhishek Sethi and Sanchit Agarwal and Shuyang Gao and Dilek Hakkani{-}T{\"{u}}r},
  title      = {MultiWOZ 2.1: Multi-Domain Dialogue State Corrections and State Tracking Baselines},
  journal    = {CoRR},
  volume     = {abs/1907.01669},
  year       = {2019},
  url        = {http://arxiv.org/abs/1907.01669},
  eprinttype = {arXiv},
  eprint     = {1907.01669}
}

@inproceedings{budzianowski2018multiwoz,
  title     = "{M}ulti{WOZ} - A Large-Scale Multi-Domain {W}izard-of-{O}z Dataset for Task-Oriented Dialogue Modelling",
  author    = "Budzianowski, Pawe{\l} and Wen, Tsung-Hsien and Tseng, Bo-Hsiang and Casanueva, I{\~n}igo and Ultes, Stefan and Ramadan, Osman and Ga{\v{s}}i{\'c}, Milica",
  booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  month     = oct # "-" # nov,
  year      = "2018",
  address   = "Brussels, Belgium",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/D18-1547",
  doi       = "10.18653/v1/D18-1547",
  pages     = "5016--5026"
}

@inproceedings{min2020dsi,
  title     = {Dialogue State Induction Using Neural Latent Variable Models},
  author    = {Min, Qingkai and Qin, Libo and Teng, Zhiyang and Liu, Xiao and Zhang, Yue},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
  publisher = {International Joint Conferences on Artificial Intelligence Organization},
  editor    = {Christian Bessiere},
  pages     = {3845--3852},
  year      = {2020},
  month     = {7},
  note      = {Main track},
  doi       = {10.24963/ijcai.2020/532},
  url       = {https://doi.org/10.24963/ijcai.2020/532}
}

@inproceedings{gao2021lmbff,
  title     = "Making Pre-trained Language Models Better Few-shot Learners",
  author    = "Gao, Tianyu and Fisch, Adam and Chen, Danqi",
  booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
  month     = aug,
  year      = "2021",
  address   = "Online",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2021.acl-long.295",
  doi       = "10.18653/v1/2021.acl-long.295",
  pages     = "3816--3830"
}

@inproceedings{cui2021template,
  title     = "Template-Based Named Entity Recognition Using {BART}",
  author    = "Cui, Leyang and Wu, Yu and Liu, Jian and Yang, Sen and Zhang, Yue",
  booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021",
  month     = aug,
  year      = "2021",
  address   = "Online",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2021.findings-acl.161",
  doi       = "10.18653/v1/2021.findings-acl.161",
  pages     = "1835--1845"
}

@inproceedings{schick2021pet,
  title     = "Few-Shot Text Generation with Natural Language Instructions",
  author    = {Schick, Timo and Sch{\"u}tze, Hinrich},
  booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
  month     = nov,
  year      = "2021",
  address   = "Online and Punta Cana, Dominican Republic",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2021.emnlp-main.32",
  doi       = "10.18653/v1/2021.emnlp-main.32",
  pages     = "390--402"
}

@inproceedings{li2021coco,
  title     = {CoCo: Controllable Counterfactuals for Evaluating Dialogue State Trackers},
  author    = {Shiyang Li and Semih Yavuz and Kazuma Hashimoto and Jia Li and Tong Niu and Nazneen Rajani and Xifeng Yan and Yingbo Zhou and Caiming Xiong},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
  url       = {https://openreview.net/forum?id=eom0IUrF__F}
}

@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
  title     = {Attention is All you Need},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  volume    = {30},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf},
  year      = {2017}
}

@inproceedings{holtzman2020topp,
  title     = {The Curious Case of Neural Text Degeneration},
  author    = {Ari Holtzman and Jan Buys and Li Du and Maxwell Forbes and Yejin Choi},
  booktitle = {International Conference on Learning Representations},
  year      = {2020},
  url       = {https://openreview.net/forum?id=rygGQyrFvH}
}

@inproceedings{fan2018topk,
  title     = "Hierarchical Neural Story Generation",
  author    = "Fan, Angela and Lewis, Mike and Dauphin, Yann",
  booktitle = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
  month     = jul,
  year      = "2018",
  address   = "Melbourne, Australia",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/P18-1082",
  doi       = "10.18653/v1/P18-1082",
  pages     = "889--898"
}

@inproceedings{qi2020stanza,
  title     = "{S}tanza: A Python Natural Language Processing Toolkit for Many Human Languages",
  author    = "Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.",
  booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
  month     = jul,
  year      = "2020",
  address   = "Online",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2020.acl-demos.14",
  doi       = "10.18653/v1/2020.acl-demos.14",
  pages     = "101--108"
}

@inproceedings{wolf2020transformers,
  title     = "Transformers: State-of-the-Art Natural Language Processing",
  author    = "Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and Davison, Joe and Shleifer, Sam and von Platen, Patrick and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander",
  booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
  month     = oct,
  year      = "2020",
  address   = "Online",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2020.emnlp-demos.6",
  doi       = "10.18653/v1/2020.emnlp-demos.6",
  pages     = "38--45"
}

@inproceedings{kingma2015adam,
  author    = {Diederik P. Kingma and Jimmy Ba},
  editor    = {Yoshua Bengio and Yann LeCun},
  title     = {Adam: {A} Method for Stochastic Optimization},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year      = {2015},
  url       = {http://arxiv.org/abs/1412.6980}
}

\section*{Abstract}\label{sec:abstract}

Dialog State Tracking (DST) is an essential component of task-oriented dialogue systems: it helps the dialog system understand the user's requirements for completing specific tasks. In this thesis, prompt-based methods for DST in task-oriented dialogue systems are explored using the MultiWOZ dataset. Labeling all slots and values to train DST models can be costly, especially for new domains, and the prompt-based approach does not rely on a pre-defined set of slots and their possible values. Instead, it focuses on learning the DST model efficiently under low-resource few-shot settings. To examine the impact of prompt-based methods, a baseline pre-trained language model, SOLOIST\footnote{A single pre-trained language model (GPT-2) for task-oriented dialog systems}, is fine-tuned to generate belief states as a word sequence. In the prompt-based approach, the SOLOIST model is fine-tuned on limited labeled training data to generate the slots directly from values. Further, multi-prompt methods\footnote{Multi-prompt methods: \textit{Prompt Decomposition, Prompt Ensembling, Prompt Augmentation}} are applied to investigate potential improvements in slot generation performance. Experimental results show that the prompt-based methods significantly outperform the baseline model under low-resource settings. An analysis of the outputs shows that the prompt-based approach also has drawbacks stemming from the existing belief state annotation scheme in the MultiWOZ dataset. One limitation is that the prompt-based methods cannot generate multiple slots for repeated value candidates, as the slots are generated by passing values to the prompt function. The data, code, and steps to reproduce the results are publicly available \href{https://git.pavanmandava.com/pavan/master-thesis}{here}\footnote{\href{https://git.pavanmandava.com/pavan/master-thesis}{https://git.pavanmandava.com/pavan/master-thesis}}.

\clearpage

\section{Introduction}\label{sec:intro}

\paragraph{} Dialog State Tracking (DST) is an essential module in dialog systems; it is responsible for tracking the user goals in the form of dialog states based on the entire dialog history. In dialog systems, \textquote{dialog states}, also known as \textquote{belief states}, contain a set of \textit{(slot, value)} pairs for each turn of the dialog history. The \textit{(slot, value)} pairs hold specific pieces of information required for the dialog system to perform the task and help in generating the responses. The values of the slots can change when the user provides more information or accepts the system's recommendations. Existing data-driven methods and neural models for individual dialog modules (NLU, DST, NLG) and end-to-end dialog systems show promising results, but they need large amounts of task-specific training data, which is rarely available for new tasks. These neural DST models do not generalize well to new domains with limited data \citep{li2021coco}. For task-specific DST, collecting dialog state labels can be costly and time-consuming, requiring domain experts to indicate all possible (\textit{slot, value}) pairs for each turn of the dialog history. A typical task-oriented dialog system contains an ontology for each domain, with a pre-defined set of slots and all possible values for each domain. In real-world applications, defining all possible slots and values for DST is difficult due to the increasing number of new domains and the evolving needs of the users.

\paragraph{} Prompt-based learning \textit{(\textquote{pre-train, prompt, and predict})} is a new paradigm in NLP that aims to predict the probability of text directly with a pre-trained LM. This framework is powerful because it allows the language model to be \textit{pre-trained} on massive amounts of raw text, and by defining a new prompting function the model can perform \textit{few-shot} or even \textit{zero-shot} learning \citep{liu2021ppp}. Large pre-trained language models (PLMs) are expected to be useful in few-shot scenarios where task-related training data is limited, as they can be probed for task-related knowledge efficiently by using a prompt. One example of such a large pre-trained language model is GPT-3 \citep{brown2020gpt3} - \textit{\textquote{Language Models are Few-Shot Learners}}. \citet{madotto2021fsb} created an end-to-end chatbot (Few-Shot Bot) using \textit{prompt-based few-shot learning} and achieved results comparable to the state of the art. Prompting methods are particularly helpful in few-shot learning where domain-related data is limited. \textit{Fixed-prompt LM tuning} is a fine-tuning strategy for downstream tasks, where the LM parameters are tuned with fixed prompts that help the LM understand the task. This can be achieved by applying a discrete textual prompt template to the data used for fine-tuning the PLM.

\paragraph{} Prompt-based learning for few-shot DST with limited labeled domains is still under-explored. Recently, \citet{yang2022prompt} proposed a new prompt learning framework for few-shot DST. This work designed a \textit{value-based prompt} and an \textit{inverse prompt} mechanism to efficiently train a DST model for domains with limited training data. The approach does not depend on an ontology of slots, and the results show that it can generate slots by prompting the tuned PLM and that it outperforms existing state-of-the-art methods under few-shot settings. In this thesis, prompt-based few-shot methods for DST are explored by implementing the following three tasks:
\begin{enumerate}
    \item Prompt-based few-shot DST - reproduce the results from \citet{yang2022prompt}
    \begin{itemize}
        \item[--] Implement prompt-based methods for the DST task under few-shot settings
        \item[--] Implement a baseline model for comparison with the prompt-based methods
    \end{itemize}
    \item Evaluation and analyses of belief state predictions
    \begin{itemize}
        \item[--] Evaluate the DST task using the Joint Goal Accuracy (JGA) metric
        \item[--] Improvements observed from the prompt-based methods
        \item[--] Drawbacks of the prompt-based methods
    \end{itemize}
    \item Extend the prompt-based methods to utilize various \textit{multi-prompt} techniques
    \begin{itemize}
        \item[--] Can different multi-prompt techniques help the PLM better understand the DST task?
        \item[--] How do the various multi-prompt techniques influence the performance of the prompt-based model?
    \end{itemize}
\end{enumerate}

\clearpage

\section{Background \& Related Work}\label{sec:background}

\subsection{Dialog State Tracking (DST)}

\paragraph{} Task-oriented dialog systems, both modular and end-to-end, are capable of handling a wide range of tasks (such as ticket booking, restaurant booking, etc.) across various domains. A task-oriented dialogue system has stricter requirements for its responses because it needs to accurately understand and process the user's message. Therefore, modular methods were suggested as a way to generate responses in a more controlled manner. A typical modular system uses a pipeline of four modules that execute sequentially - Natural Language Understanding (NLU), Dialog State Tracking (DST), Policy Learning (POL), and Natural Language Generation (NLG). The DST module is essential for enabling the system to comprehend the user's requests by tracking them in the form of slots and values (belief states) at every turn. For instance, in a dialogue system that helps users book flights, the system might track slots such as destination, departure, travel date, and number of travelers. By keeping track of these slots and their values, the system can understand the user requirements and provide this information to the next module. For example, consider the user message - \textquote{\textit{Plan a train trip to Berlin this Friday for two people}} - the DST module is expected to extract (\textit{slot, value}) pairs as follows: \{(\textit{destination, Berlin}), (\textit{day, Friday}), (\textit{people, 2})\}. In this thesis, the focus is on the DST module for extracting slots and values.

\subsection{Pre-trained Language Models (PLMs)}

\paragraph{} Large pre-trained language models are trained on huge amounts of textual data and have achieved state-of-the-art performance in a variety of NLP tasks, such as machine translation, text classification, text generation, and summarization. By training on such large corpora, these models learn the probability distribution of words in a language. Pre-trained language models based on transformer architectures \citep{vaswani2017attention}, such as BERT \citep{devlin2019bert} and GPT \citep{radford2018gpt}, have achieved state-of-the-art performance on many NLP tasks. GPT-2 \citep{radford2019gpt2} is a transformer-based auto-regressive language model trained on large amounts of open web text data. GPT-2 is trained with a simple objective: predict the next word, given all previous words within some text. The architecture and training objective of a PLM play an important role in determining its applicability to particular prompting tasks \citep{liu2021ppp}. For example, left-to-right auto-regressive LMs predict the next word by assigning a probability to the sequence of words. For tasks that require the PLM to generate text from \textit{prefix} prompts (the entire prompt string followed by generated text), such prompts tend to mesh well with the left-to-right nature of these models.

\paragraph{} The baseline model of this thesis, \textsc{Soloist} \citep{peng2021soloist}, uses a 12-layer GPT-2 for building the task-oriented dialog system. \textsc{Soloist} uses the publicly available 117M-parameter GPT-2 as initialization for task-grounded pre-training. The prompt-based methods in this thesis utilize the pre-trained \textsc{Soloist} and fine-tune it on the downstream DST task.
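
As a minimal illustration of this left-to-right, next-word objective, the following Python sketch queries GPT-2 for its next-word distribution given a prefix. It assumes the Hugging Face \texttt{transformers} and \texttt{torch} packages and is illustrative only, not part of the thesis code.

\begin{verbatim}
# Minimal sketch: next-word probabilities from an auto-regressive LM (GPT-2).
# Assumes the `transformers` and `torch` packages are installed.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")   # 117M-parameter GPT-2
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

prefix = "Plan a train trip to"
input_ids = tokenizer(prefix, return_tensors="pt").input_ids

with torch.no_grad():
    logits = model(input_ids).logits             # (1, seq_len, vocab_size)
probs = torch.softmax(logits[0, -1], dim=-1)     # distribution over the next word

top = torch.topk(probs, k=5)
for p, idx in zip(top.values, top.indices):
    print(f"{tokenizer.decode(int(idx)):>12s}  {p.item():.3f}")
\end{verbatim}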

\subsection{SOLOIST Model}

\paragraph{} \textsc{Soloist} \citep{peng2021soloist} is a task-oriented dialog system that uses transfer learning and machine teaching to build task bots at scale. \textsc{Soloist} follows the \textit{pre-train, fine-tune} paradigm for building end-to-end dialog systems with a transformer-based auto-regressive language model, GPT-2 \citep{radford2019gpt2}, which subsumes the different dialog modules (i.e., NLU, DST, POL, NLG) into a single model. In the \textit{pre-train, fine-tune} paradigm, a fixed \textit{pre-trained} LM is adapted to different downstream tasks by introducing additional parameters and \textit{fine-tuning} them using task-specific objective functions. In the pre-training stage, \textsc{Soloist} is initialized with the 12-layer GPT-2 (117M parameters) and further trained on large heterogeneous dialog corpora. The primary goal at this stage is to learn task completion skills such as belief state prediction (DST) and response generation. In the fine-tuning stage, the pre-trained \textsc{Soloist} model can be adapted to new tasks using just a handful of task-specific dialogs.

\paragraph{} In this thesis, the pre-trained \textsc{Soloist} is the baseline model for generating the belief states. For the baseline DST task, the pre-trained \textsc{Soloist} is fine-tuned on the belief prediction task as open-ended text generation. For the prompt-based methods, the same \textsc{Soloist} model is fine-tuned to generate belief state slots using prompts. The results and outputs of the baseline model are compared with those of the prompt-based model for detailed analyses.

\subsection{Prompt Learning}

\paragraph{} Prompt-based learning (also dubbed \textit{\textquote{pre-train, prompt, and predict}}) is a new paradigm that aims to utilize PLMs more efficiently to solve downstream NLP tasks \citep{liu2021ppp}. In this paradigm, instead of adapting pre-trained LMs to downstream tasks by designing task-specific training objectives, downstream tasks are reformulated to look more like those solved during the original LM training with the help of a textual \textit{prompt}. To perform prediction tasks, the original input $x$ is modified using a \textit{template} into a textual \textit{prompt} $x^{\prime}$ that has some unfilled slots, and the PLM is then used to probabilistically fill the unfilled slots with an answer $z$, from which the final output $y$ can be derived. For text generation tasks, the generated answer $z$ itself is the output $y$.
\vspace{4pt}

\begin{table}[!ht]
\centering
\begingroup
\setlength{\tabcolsep}{12pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.4} % Default value: 1
\begin{tabular}{l c l}
\hline
\textbf{Name} & \textbf{Notation} & \textbf{Example}\\
\hline
\textit{Input} & $x$ & I missed the bus today. \\
\textit{Output} & $y$ & sad \\
\hline
\textit{Prompt Function} & $f_{prompt}(x)$ & $[X]$ I felt so $[Z]$ \\
\hline
\textit{Prompt} & $x^{\prime}$ & I missed the bus today. I felt so $[Z]$ \\
\textit{Answered Prompt} & $f_{fill}(x^{\prime}, z)$ & I missed the bus today. I felt so sad \\
\hline
\textit{Answer} & $z$ & \textit{happy}, \textit{sad}, \textit{scared} \\
\hline
\end{tabular}
\endgroup
\caption{Terminology and notations of prompting methods}
\label{table:1}
\end{table}

\paragraph{} For example, to recognize the emotion in a text with \textit{input} $x = $ \textquote{I missed the bus today.}, \textit{the prompt function} (also called \textit{template}) may take a form such as \textquote{$[X]$ I felt so $[Z]$}. $[X]$ takes the input text and $[Z]$ is to be generated by the LM. The \textit{prompt} $x^{\prime}$ then becomes \textquote{I missed the bus today. I felt so $[Z]$}, and the PLM is asked to fill the slot $[Z]$ with an emotion-bearing word. For some text generation tasks, the answer mapping from $z$ to $y$ may not be required, as the generated text itself becomes the output. There are two main varieties of prompts: \textit{cloze prompts}, where the slot $[Z]$ is filled in the middle of the text, and \textit{prefix prompts}, where the input text comes entirely before $[Z]$. In general, for tasks that are solved with a standard auto-regressive LM, prefix prompts tend to be more helpful, as they mesh well with the left-to-right nature of the model.
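
The terminology of Table \ref{table:1} can be illustrated with a few lines of Python; the helper names below (\texttt{f\_prompt}, \texttt{f\_fill}) are hypothetical and only mirror the notation of the table.

\begin{verbatim}
# Illustrative sketch of the prompting terminology in Table 1.
# The helper names (f_prompt, f_fill) are hypothetical, not from a library.

TEMPLATE = "[X] I felt so [Z]"

def f_prompt(x: str) -> str:
    """Fill the input slot [X]; the answer slot [Z] stays open."""
    return TEMPLATE.replace("[X]", x)

def f_fill(x_prime: str, z: str) -> str:
    """Fill the answer slot [Z] with a candidate answer z."""
    return x_prime.replace("[Z]", z)

x = "I missed the bus today."
x_prime = f_prompt(x)              # "I missed the bus today. I felt so [Z]"
answered = f_fill(x_prime, "sad")  # "I missed the bus today. I felt so sad"
print(x_prime)
print(answered)
\end{verbatim}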

\paragraph{} Prompt-based methods can be used without any explicit training of the LM for the downstream task, simply by taking a suitable pre-trained LM and applying the prompts defined for the task. This approach is traditionally called \textit{zero-shot learning}. \textit{Few-shot learning} is another approach where only a small number of data samples are used to train the language model. Prompting methods are particularly useful under few-shot settings, as there is generally not enough training data to fully specify the desired behavior. \textit{Fixed-prompt LM tuning} is a training strategy that fine-tunes the parameters of the LM, as in the standard \textit{pre-train, fine-tune} paradigm, using discrete prompts (\textit{hard prompts}) to help the PLM understand the downstream task. This approach can potentially lead to improvements, particularly in few-shot scenarios.

\subsection{Prompt-based DST}

\paragraph{} Previous work by \citet{lee2021sdp} uses belief state slots in the prompts, along with natural language descriptions of the schema, for generating the corresponding values. This \textit{slot-based} prompt DST approach uses an encoder-decoder LM with a bi-directional encoder. The method relies on a known ontology of slots and requires a lot of training data for fine-tuning the PLM. In real-world applications, defining all possible slots is difficult due to the emergence of new domains and the evolving needs of users. \citet{yang2022prompt} proposed a new prompt-learning framework for DST that uses values in the prompts (\textit{value-based}) and generates slots directly from the PLM. This \textit{value-based} prompt approach does not rely on an ontology of slots or their natural language descriptions. In task-oriented dialog systems, prompt-based DST methods are still under-explored. In this thesis, the value-based prompt approach is applied to few-shot DST.

\subsection{MultiWOZ Dataset}

\paragraph{} MultiWOZ \citep{budzianowski2018multiwoz} is a multi-domain task-oriented dialogue dataset that contains over 10K dialogues across 8 domains. It is a fully-labeled collection of human-human written conversations and has been a widely used dataset for benchmarking DST methods. \citet{eric2019multiwoz} released MultiWOZ 2.1 after fixing noisy dialog state annotations and utterances that negatively impact the performance of DST models. In this thesis, MultiWOZ 2.1 is used to benchmark both the baseline and the prompt-based methods.

\section{Methods}\label{sec:methods}

\paragraph{} This section describes the research methods and experimental setup of the work conducted in this thesis. The work can be divided into the following tasks: \textsc{Soloist} baseline implementation for few-shot DST, prompt-based methods for few-shot DST, evaluation and analysis of belief state predictions, and multi-prompt methods for DST.

\subsection{Dataset}

The baseline and prompt-based methods are benchmarked on the MultiWOZ 2.1 \citep{eric2019multiwoz} dataset. The MultiWOZ dataset contains 8438/1000/1000 single-domain and multi-domain dialogues for training/validation/testing, respectively. Each dialogue can have multiple turns, and each turn can include multiple \textit{(slot, value)} pairs. Dialogues from only five domains (\textit{Restaurant, Hotel, Attraction, Taxi, Train}) and one sub-domain (\textit{Booking}) are used in the experiments, as the other two domains (\textit{Hospital, Police}) only appear in the training set. To observe the performance under few-shot settings, dialogues are randomly sampled for each domain and six different data splits are created. Each data split contains all five domains, and the dialogues are evenly distributed across them. Only single-domain dialogues (including the booking sub-domain) are selected for creating the data splits. The validation and test sets are only filtered by domain and are not down-sampled. Table \ref{table:2} provides the data statistics and a summary of the data splits used in the few-shot experiments.

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{10pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\begin{tabular}{|l|c|c|c|}
\hline
\textbf{Data Splits} & \textbf{\# Dialogues} & \textbf{\# Total Turns} & \textbf{\# (slot, value)} \\
\hline
\textsl{5-dpd} & 25 & 100 & 294 \\ \hline
\textsl{10-dpd} & 50 & 234 & 758 \\ \hline
\textsl{50-dpd} & 250 & 1114 & 3535 \\ \hline
\textsl{100-dpd} & 500 & 2292 & 7408 \\ \hline
\textsl{125-dpd} & 625 & 2831 & 9053 \\ \hline
\textsl{250-dpd} & 1125 & 5187 & 17214 \\ \hline
\textsl{valid} & 190 & 900 & 3106 \\ \hline
\textsl{test} & 193 & 894 & 3411 \\ \hline
\end{tabular}
\endgroup
\caption{Data statistics and data split summary for few-shot experiments. The term \textsl{dpd} means \textsl{\textquote{dialogues per domain}}. Each split contains dialogues for all five domains. In data split \textsl{250-dpd}, the domain \textquote{\textit{Attraction}} contains only 125 dialogues.}
\label{table:2}
\end{table}
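
The few-shot splits in Table \ref{table:2} are built by sampling a fixed number of single-domain dialogues per domain. The sketch below outlines this sampling step; the \texttt{domains} field and the helper itself are assumptions made for illustration, not the exact format of the released corpus.

\begin{verbatim}
# Minimal sketch: build a "k dialogues per domain" few-shot split.
# Assumes `dialogues` is a list of dicts with a "domains" field; the field
# name is an assumption for illustration, not the MultiWOZ schema.
import random

DOMAINS = ["restaurant", "hotel", "attraction", "taxi", "train"]

def make_split(dialogues, k, seed=42):
    rng = random.Random(seed)
    split = []
    for domain in DOMAINS:
        # keep only single-domain dialogues of this domain (booking allowed)
        candidates = [d for d in dialogues
                      if set(d["domains"]) <= {domain, "booking"}]
        rng.shuffle(candidates)
        split.extend(candidates[:k])   # e.g. k = 5 for the 5-dpd split
    return split
\end{verbatim}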

\paragraph{} In the MultiWOZ 2.1 dataset, 16 dialog slots are used to capture the user requirements. For the prompt-based experiments, these slots are converted into natural language words so that the slot generation process can be fine-tuned on them. Table \ref{table:3} lists the slots from all five domains and the \textit{booking} sub-domain.

\begin{table}[!ht]
\centering
\begin{tabular}{l}
\hline
\multicolumn{1}{c}{\textbf{Slots}} \\
\hline
\textsl{area, arrive, day, departure, destination, food, internet, leave,} \\
\textsl{name, parking, people, price, stars, stay, time, type} \\
\hline
\end{tabular}
\caption{Slots from MultiWOZ 2.1 dataset used in prompt-based experiments}
\label{table:3}
\end{table}

\subsection{SOLOIST Baseline}

\textsc{Soloist} \citep{peng2021soloist} is the baseline model for the prompt-based methods. \textsc{Soloist} is initialized with the 12-layer GPT-2 \citep{radford2019gpt2} and further trained on two task-oriented dialog corpora (Schema and Taskmaster). The task-grounded pre-training helps the \textsc{Soloist} model solve two dialog-related tasks: \textit{belief state prediction} and \textit{response generation}. In the belief state prediction task, the model takes the dialog history as input and generates the belief states as a sequence of words. For the baseline implementation in this thesis, the pre-trained \textsc{Soloist} is fine-tuned on the MultiWOZ 2.1 data splits to perform the belief prediction task. At inference time, the fine-tuned \textsc{Soloist} baseline does not need a pre-defined set of slots and their possible values; it uses top-$k$ \citep{fan2018topk} and top-$p$ (nucleus) \citep{holtzman2020topp} sampling for generating the belief states. In the prompt-based DST task, the same pre-trained \textsc{Soloist} model is fine-tuned for prompt-based slot generation.
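
This decoding step can be sketched with the Hugging Face \texttt{generate} API as follows; the checkpoint name, the dialog-history format, and the \textquote{belief state:} marker are placeholders, not the exact \textsc{Soloist} input format.

\begin{verbatim}
# Sketch: sample a belief-state continuation with top-k / nucleus sampling.
# The history format and "=> belief state:" marker are placeholders.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")   # fine-tuned checkpoint in practice

history = "user: i need a cheap restaurant in the centre => belief state:"
input_ids = tokenizer(history, return_tensors="pt").input_ids

output_ids = model.generate(
    input_ids,
    do_sample=True,          # enable sampling instead of greedy decoding
    top_k=50,                # keep the 50 most likely next tokens
    top_p=0.9,               # nucleus sampling: smallest set with 90% mass
    max_new_tokens=32,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0][input_ids.shape[1]:]))
\end{verbatim}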

\subsection{Prompt-based few-shot DST}

This task aims to apply the prompt-based methods proposed by \citet{yang2022prompt} and reproduce their results. It utilizes the \textit{value-based prompt} and the \textit{inverse prompt} for fine-tuning the pre-trained \textsc{Soloist}, which can then generate the belief state slots directly at inference time. The prompt-based methods are evaluated on the same data splits (Table \ref{table:2}) of the MultiWOZ 2.1 dataset.

\paragraph{Value-based prompt} An intuitive idea for generating (\textit{slot, value}) pairs is to use slots in the prompts and generate the corresponding values \citep{lee2021sdp}. For example, given the utterance \textquote{\textsl{Plan a trip to Berlin}} and the slot (\textsl{destination}), the prompt to the PLM could become \textquote{\textsl{Plan a trip to Berlin. destination = [z]}}, and the PLM is expected to generate \textsl{[z]} as \textquote{\textsl{Berlin}}. However, this approach relies on an ontology of slots, and the fixed set of slots can change in real-world applications. \citet{yang2022prompt} proposed the \textit{value-based prompt}, which uses values in the prompts and generates the corresponding slots. This method does not require any pre-defined set of slots and can generate slots directly from the PLM. Consider the prompt template \textquote{\textsl{belief states: value = [v], slot = [s]}}: the prompt function $f$ can be of the form $f(v) = $ \textsl{[dialog history] belief states: value = [v], slot = [s]}. Given the value candidate $v = $ \textquote{\textsl{Berlin}}, the PLM can generate the slot \textsl{[s] = \textquote{destination}}. The overall training objective of value-based prompt generation is minimizing the negative log-likelihood of the slots in the training dataset $D$:
\begin{equation} \label{eq:1}
\mathcal{L}=-\sum_{t}^{|D|} \log P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)
\end{equation}
where $P\left(s_{t} \mid c_{t}, f\left(v_{t}\right)\right)$ is the probability of generating the slot $s_t$ given the dialog history $c_t$ and the prompt function $f$ filled with the value $v_t$ at turn $t$.
The loss $\mathcal{L}$ from this step is combined with the loss from the inverse prompt (described next) to compute the final loss. During training, the annotated values from the dataset are used to fill in the value-based prompts.
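
A minimal sketch of how the filled value-based and inverse prompt strings can be constructed is shown below; the helper names are hypothetical, and only the template wording follows Table \ref{table:4}.

\begin{verbatim}
# Sketch: building the filled value-based and inverse prompts (Table 4 templates).
# Helper names are hypothetical; only the template wording follows the thesis text.

def value_prompt(history: str, value: str, slot: str = "") -> str:
    """f(v): dialog history + value-based prompt; slot is empty at inference."""
    return f"{history} belief states: value = {value}, slot = {slot}".rstrip()

def inverse_prompt(history: str, slot: str, value: str = "") -> str:
    """I(s): dialog history + inverse prompt; value is the training target."""
    return f"{history} belief states: slot = {slot}, value = {value}".rstrip()

history = "user: plan a train trip to berlin this friday"
print(value_prompt(history, "berlin", "destination"))    # training example
print(value_prompt(history, "berlin"))                   # inference-time prompt
print(inverse_prompt(history, "destination", "berlin"))  # auxiliary training example
\end{verbatim}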

\paragraph{Inverse Prompt} The \textit{inverse prompt} mechanism \citep{yang2022prompt} aims to generate the values by filling the prompts with the generated slots. After generating a slot $s$ using the value-based prompt, this generated slot is presented to the inverse prompt function $I$. The inverse prompt aims to generate a value $v^{\prime}$ that is supposed to be close to the original value $v$. The prompt template for the inverse prompt function can be of the form $I = $ \textquote{\textsl{belief states: slot = [s], value = [v]}}, where \textsl{[s]} is filled with the slot generated by the value-based prompt and \textsl{[v]} is expected to be generated by the PLM. The inverse prompt can be considered an auxiliary task for the value-based prompt, which can improve performance by helping the PLM understand the task, especially in low-resource scenarios. The loss function $\tilde{\mathcal{L}}$ for the inverse prompt mechanism is:
\begin{equation} \label{eq:2}
\tilde{\mathcal{L}}=-\sum_{t}^{|D|} \log P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)
\end{equation}
where $P\left(v^{\prime}_{t} \mid c_{t}, I\left(s_{t}\right)\right)$ is the probability of generating the value $v^{\prime}_{t}$ by filling the inverse prompt $I\left(s_{t}\right)$ with the generated slot $s_{t}$.

\noindent The final loss $\mathcal{L}^{*}$ is computed by combining the value-based prompt loss $\mathcal{L}$ and the inverse prompt loss $\tilde{\mathcal{L}}$:
\begin{equation} \label{eq:3}
\mathcal{L}^{*} = \mathcal{L} + w *\tilde{\mathcal{L}}
\end{equation}
where $w \in (0, 1)$ is a weight used to adjust the influence of the inverse prompt.
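
The combined objective in Equation \ref{eq:3} can be sketched as a single training step that masks the prompt tokens and scores only the target tokens; this is an illustrative simplification, not the exact fine-tuning code.

\begin{verbatim}
# Sketch: one training step combining value-based and inverse prompt losses
# (Eq. 3). The loss is computed only on the target tokens by masking the
# prompt part of the labels with -100 (ignored by the Hugging Face LM loss).
import torch

def lm_loss(model, tokenizer, prompt: str, target: str) -> torch.Tensor:
    full = tokenizer(prompt + " " + target, return_tensors="pt").input_ids
    prompt_len = tokenizer(prompt, return_tensors="pt").input_ids.shape[1]
    labels = full.clone()
    labels[:, :prompt_len] = -100          # ignore prompt tokens in the loss
    return model(full, labels=labels).loss

def training_step(model, tokenizer, history, value, slot, w=0.5):
    loss_vp = lm_loss(model, tokenizer,
                      f"{history} belief states: value = {value}, slot =", slot)
    loss_ip = lm_loss(model, tokenizer,
                      f"{history} belief states: slot = {slot}, value =", value)
    return loss_vp + w * loss_ip           # L* = L + w * L~
\end{verbatim}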

\paragraph{Training} For training the prompt-based methods, the pre-trained \textsc{Soloist} (GPT-2, 117M parameters) is fine-tuned with the value-based prompt and the inverse prompt. All the MultiWOZ 2.1 data splits (Table \ref{table:2}) are used in the fine-tuning process in order to evaluate the performance under few-shot settings. The training strategy \textit{fixed-prompt LM tuning} is adopted for the prompt-based methods, where fixed discrete prompts are used while fine-tuning the parameters of the LM. Table \ref{table:4} shows the prompt templates used in the fine-tuning process. The prompts are appended to the dialog history before being given as input to the PLM, which probabilistically generates the missing slots. The inverse prompt is only used during the training phase. Experiments are also performed to evaluate the influence of the inverse prompt by omitting it during the training process.

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{10pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.35} % Default value: 1
\begin{tabular}{ll}
\hline
\multicolumn{1}{c}{\textbf{Type}} & \multicolumn{1}{c}{\textbf{Prompt templates}} \\
\hline
value-based prompt & belief states: value = [v], slot = [s] \\
inverse prompt & belief states: slot = [s], value = [v] \\
\hline
\end{tabular}
\endgroup
\caption{Prompt templates used during the training phase.}
\label{table:4}
\end{table}
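
A compact sketch of the fixed-prompt LM tuning loop follows; the optimizer, learning rate, number of epochs, and the single toy example are illustrative placeholders rather than the settings used in the experiments.

\begin{verbatim}
# Sketch: fixed-prompt LM tuning loop (illustrative hyper-parameters only).
# For simplicity the loss here covers the whole prompt string; the earlier
# sketch shows how to restrict it to the target tokens.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")   # pre-trained SOLOIST in practice
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
w = 0.5                                           # weight of the inverse prompt loss

train_examples = [
    ("user: plan a train trip to berlin", "berlin", "destination"),
]  # placeholder few-shot examples: (dialog history, value, slot)

model.train()
for epoch in range(3):                            # placeholder epoch count
    for history, value, slot in train_examples:
        vp = f"{history} belief states: value = {value}, slot = {slot}"
        ip = f"{history} belief states: slot = {slot}, value = {value}"
        ids_vp = tokenizer(vp, return_tensors="pt").input_ids
        ids_ip = tokenizer(ip, return_tensors="pt").input_ids
        loss = model(ids_vp, labels=ids_vp).loss \
               + w * model(ids_ip, labels=ids_ip).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
\end{verbatim}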

\paragraph{Testing (Slot Generation)} During the testing phase, only the value-based prompts are used to generate the slots. The filled prompt, together with the dialog history, is given as input to the PLM, and the next word with the highest probability is taken as the generated slot. At test time, the value candidates are not known, so a set of rules is applied to extract candidate values directly from the user utterances. This sort of value extraction from utterances was previously explored by \citet{min2020dsi}.
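
This slot-generation step can be sketched as greedily decoding a few tokens after the filled value-based prompt; the helper below is illustrative and assumes a fine-tuned model and tokenizer as above.

\begin{verbatim}
# Sketch: generate a slot name greedily from a filled value-based prompt.
import torch

@torch.no_grad()
def generate_slot(model, tokenizer, history: str, value: str,
                  max_tokens: int = 3) -> str:
    prompt = f"{history} belief states: value = {value}, slot ="
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    out = model.generate(ids, do_sample=False, max_new_tokens=max_tokens,
                         pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(out[0][ids.shape[1]:]).strip()

# e.g. generate_slot(model, tokenizer, "user: plan a trip to berlin", "berlin")
# is expected to return "destination" from a fine-tuned model.
\end{verbatim}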

\paragraph{Value Extraction} Value candidates are extracted directly from the dialog history and are provided to the value-based prompts for generating slots at inference time. The Stanza toolkit \citep{qi2020stanza} from the Stanford NLP group is used to first extract POS tags and named entities; a set of rules is then applied to extract the candidate values (a code sketch follows the list below).

\begin{itemize}
    \item Adjectives (JJ) and adverbs (RB) are considered possible values
    \begin{itemize}
        \item[$\circ$] E.g., \textsl{expensive}, \textsl{moderate}, \textsl{important}
    \end{itemize}
    \item The preceding negator \textsl{not} is taken into account
    \begin{itemize}
        \item[$\circ$] E.g., \textsl{not expensive}, \textsl{not important (= dont care)}
    \end{itemize}
    \item All named entities are considered (names of places, times, dates/days, numbers)
    \begin{itemize}
        \item[$\circ$] E.g., \textsl{cambridge}, \textsl{friday}, \textsl{08:30}
    \end{itemize}
    \item A custom set of regex NER rules is applied for recognizing additional named entities
    \begin{itemize}
        \item[$\circ$] E.g., \textsl{restaurant names}, \textsl{attraction names}
    \end{itemize}
    \item Stop words and repeated candidate values are filtered out
\end{itemize}
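
The sketch below illustrates the first rule types with Stanza (adjectives and adverbs with an optional preceding \textsl{not}, plus named entities); the custom regex rules and stop-word filtering are omitted for brevity.

\begin{verbatim}
# Sketch: rule-based value-candidate extraction with Stanza (first rules only).
import stanza

# stanza.download("en")  # one-time model download
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,ner")

def extract_value_candidates(utterance: str):
    doc = nlp(utterance)
    candidates = []
    for sentence in doc.sentences:
        words = sentence.words
        for i, word in enumerate(words):
            if word.xpos in ("JJ", "RB"):                  # adjectives / adverbs
                if i > 0 and words[i - 1].text.lower() == "not":
                    candidates.append("not " + word.text)  # keep the negator
                else:
                    candidates.append(word.text)
    candidates += [ent.text for ent in doc.ents]           # named entities
    return list(dict.fromkeys(candidates))                 # drop repeated values

print(extract_value_candidates("Book a cheap restaurant in Cambridge for Friday."))
\end{verbatim}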

\paragraph{Prompt Decomposition} For utterances where multiple \textsl{(slot, value)} pairs are expected to be predicted, directly using a single prompt to generate multiple slots is challenging. Prompt decomposition is a multi-prompt method that breaks the prompt down into sub-prompts and generates a slot separately for each sub-prompt. For each value extracted from the utterance, a value-based prompt is constructed and the corresponding slot is generated, as shown in Table \ref{table:5}. This sort of prompt decomposition has been explored by \citet{cui2021template} for the named entity recognition (NER) task. The approach is applied in both the training and testing phases.

\vspace{0.25cm}

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\begin{tabular}{ c l }
\hline
Utterance: & Book a flight to Berlin on friday at 08:30.\\
\hline
Prompt 1: & belief states: value = \textsl{Berlin}, slot = [s]\\
Prompt 2: & belief states: value = \textsl{friday}, slot = [s]\\
Prompt 3: & belief states: value = \textsl{08:30}, slot = [s]\\
\hline
\end{tabular}
\endgroup
\caption{Sub-prompts for an utterance with multiple values.}
\label{table:5}
\end{table}
\subsection{Multi-prompt methods for DST}
|
||||||
|
The \textsl{value-based} prompt described in the previous section utilizes a \textsl{single} prompt for making predictions. However, a significant body of research has demonstrated that the use of multiple prompts can further improve the efficacy of prompting methods \citep{liu2021ppp}. There are different ways to extend the single prompt learning to use multiple prompts. This task explores two more multi-prompt learning methods: \textit{Prompt Ensembling} and \textit{Prompt Augmentation}. Experiments are performed on all the data splits of MultiWOZ 2.1 dataset. This task aims to answer the following questions - \textsf{Q1:} Can different \textsl{multi-prompt} techniques together help the PLM better understand the DST task? \textsf{Q2:} Can the use of multiple discrete prompts improve the performance of prompt-based model?

\paragraph{Prompt Ensembling} This method uses multiple \textit{value-based} prompts at training and inference time. Ensembling can leverage the complementary advantages of different prompts and stabilize performance on the downstream task. \citet{yang2022prompt} applied prompt ensembling to the value-based prompt by training a separate model for each prompt. An alternative is to train a single model with multiple prompts, which is much faster and more memory efficient than training a separate model per prompt \citep{schick2021pet}. Prompt ensembling is applied only to the value-based prompts; the inverse prompt uses a single prompt. In this task, four hand-crafted prompt templates are chosen for the value-based prompts and trained on a single model. The probability of the generated slot $s_t$ over multiple prompt functions is calculated as a weighted average of the per-prompt probabilities:
\begin{equation} \label{eq:4}
P\left(s_{t} \mid c_{t}\right)=\sum_{k=1}^{|K|} \alpha_{k} \cdot P\left(s_{t} \mid c_{t}, f_{k}\left(v_{t}\right)\right)
\end{equation}
where $|K|$ is the number of prompt functions, $f_{k}$ is the $k$-th prompt function, and $\alpha_{k}$ is the weight of prompt $k$. During inference, simple majority voting is used to pick the generated slot across the multiple prompts; when there is no majority, the slot with the highest probability is picked. Table \ref{table:6} lists all the prompt templates used in prompt ensembling.

\vspace{0.25cm}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\begin{tabular}{c l}
\hline
\multicolumn{2}{c}{\textbf{Prompt ensemble templates}}\\
\hline
$f_{1}$ & belief states: [v] = [s]\\
$f_{2}$ & [v] is the value of [s]\\
$f_{3}$ & [v] is of slot type [s]\\
$f_{4}$ & belief states: value = [v], slot = [s]\\
\hline
\end{tabular}
\endgroup
\caption{Prompt templates used for prompt ensemble.}
\label{table:6}
\end{table}
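
To make the ensembling procedure concrete, the sketch below shows the weighted averaging of Eq. \ref{eq:4} and the majority-vote decision used at inference. The helper \texttt{prob\_of\_slot}, which returns the PLM probability of a slot string for a filled prompt, is an assumption of this sketch and not part of any existing library.

\begin{verbatim}
from collections import Counter

def ensemble_slot_probability(slot, context, value, prompt_fns, alphas,
                              prob_of_slot):
    # Weighted average over the |K| prompt functions (Eq. 4).
    return sum(alpha * prob_of_slot(f(context, value), slot)
               for f, alpha in zip(prompt_fns, alphas))

def pick_slot_by_majority(generated_slots, probabilities):
    # Majority vote over the slots generated by the individual prompts;
    # fall back to the highest-probability slot when there is no majority.
    slot, votes = Counter(generated_slots).most_common(1)[0]
    if votes > len(generated_slots) // 2:
        return slot
    best = max(range(len(generated_slots)), key=lambda i: probabilities[i])
    return generated_slots[best]
\end{verbatim}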

\paragraph{Prompt Augmentation} \textit{Prompt Augmentation}, sometimes also called \textit{demonstration learning} \citep{gao2021lmbff}, provides a few additional \textit{answered prompts} that demonstrate to the PLM how the actual value-based prompt should be answered. These demonstrations take advantage of the language models' ability to learn repetitive patterns. The answered prompts are hand-picked from the training data. At inference time, the answered prompts are added to the input before asking the PLM to generate the slot. Table \ref{table:7} below provides an example of demonstration learning.

\begin{table}[h!]
\centering
\begin{tabular}{ r l }
\hline
\multicolumn{2}{c}{\textbf{Demonstration learning}} \\
\hline
Book a cheap flight to Frankfurt. & \textit{Frankfurt} is of slot \textit{destination}\\
Plan a train trip to Berlin. & \textit{Berlin} is of slot \textit{destination}\\
Book a taxi to the University. & \textit{University} is of slot \textit{destination}\\
Book a train to Stuttgart. & \textit{Stuttgart} is of slot [s]\\
\hline
\end{tabular}
\caption{Example prompt augmentation with answered prompts.}
\label{table:7}
\end{table}
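
The sketch below shows how such an augmented input could be assembled from a few answered prompts, following the $f_{3}$-style template of Table \ref{table:7}; the demonstrations are hard-coded here purely for illustration.

\begin{verbatim}
# Hand-picked demonstrations (answered prompts), illustrative only.
DEMONSTRATIONS = [
    ("Book a cheap flight to Frankfurt.", "Frankfurt", "destination"),
    ("Plan a train trip to Berlin.", "Berlin", "destination"),
    ("Book a taxi to the University.", "University", "destination"),
]

def build_augmented_prompt(utterance, value):
    lines = [f"{u} {v} is of slot {s}" for u, v, s in DEMONSTRATIONS]
    # The final, unanswered prompt: the PLM is expected to fill in the slot.
    lines.append(f"{utterance} {value} is of slot")
    return "\n".join(lines)

prompt = build_augmented_prompt("Book a train to Stuttgart.", "Stuttgart")
\end{verbatim}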
\subsection{Evaluation Metrics}

The standard evaluation metric joint goal accuracy (JGA) is adopted to evaluate the belief state predictions of the baseline and the prompt-based methods. This metric compares all the predicted belief states to the ground-truth states at each turn; a prediction is correct only if all the predicted belief states match the ground-truth states, i.e., both slots and values must match exactly. The rule-based methods used in value extraction can lead to many false positives among the value candidates. In order to exclude the influence of wrongly extracted values, \citet{yang2022prompt} proposed JGA*, in which the joint goal accuracy is computed only over the belief states whose values are extracted correctly. These evaluation metrics answer the following questions: \textsf{Q1:} How do the prompt-based methods perform overall compared to the \textsc{Soloist} baseline? \textsf{Q2:} Can the prompt-based methods perform better under low-resource settings? \textsf{Q3:} For the prompt-based methods, does the JGA* metric yield a better score than JGA? \textsf{Q4:} Can multi-prompt techniques together perform better than a single prompt?
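
A minimal sketch of how JGA and JGA* could be computed is given below; belief states are represented as sets of (slot, value) pairs per turn, and the per-turn flag marking whether all gold values were extracted correctly is assumed to be available from the value extraction step.

\begin{verbatim}
def joint_goal_accuracy(predictions, references, values_extracted_ok=None):
    # predictions / references: one set of (slot, value) pairs per turn.
    # values_extracted_ok: optional per-turn flags; when given, only turns
    # whose gold values were all extracted correctly are counted (JGA*).
    correct, total = 0, 0
    for i, (pred, gold) in enumerate(zip(predictions, references)):
        if values_extracted_ok is not None and not values_extracted_ok[i]:
            continue
        total += 1
        if pred == gold:   # exact match of all (slot, value) pairs
            correct += 1
    return correct / total if total else 0.0
\end{verbatim}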

\paragraph{Analysis} The belief state predictions from the \textsc{Soloist} baseline and the prompt-based methods are analyzed to identify potential improvements and drawbacks. A detailed qualitative analysis is performed on the wrong belief state predictions. Additionally, an error analysis is performed on the rule-based value extraction methods to identify their impact on the slot generation process.
\subsection{Implementation Details}

For the \textsc{Soloist} baseline, the existing implementation by \citet{peng2021soloist} is adapted to the few-shot experiments conducted in this thesis. Since there is no publicly available implementation of the prompt-based methods for DST, they are implemented from scratch using the Huggingface Transformers library \citep{wolf2020transformers}. The Adam optimizer \citep{kingma2015adam} is used during fine-tuning of both the baseline and the prompt-based methods. The rule-based value extraction methods are implemented with the Stanza toolkit \citep{qi2020stanza}. The inverse prompt weight $w$ in Eq. \ref{eq:3} is set to 0.1. The prompt weight $\alpha_{k}$ in Eq. \ref{eq:4} is set to the same value ($1/4$) for all prompts used in prompt ensembling.
\section{Results}\label{sec:results}

This section presents the experimental results for all the methods described in the previous sections. Few-shot experiments are performed on all the data splits (see Table \ref{table:2}) for every method. The baseline \textsc{Soloist} model is evaluated only on the JGA metric; for the prompt-based methods, JGA* is computed in addition to JGA.
\subsection{SOLOIST Baseline}
Table \ref{table:8} shows the results of the baseline model under few-shot experiments. Experimental results show the baseline model performed poorly and struggled to generate belief states under low-resource settings (\textsl{5-dpd}, \textsl{10-dpd}, \textsl{50-dpd}). Under low-resource data splits, the limited size of data samples made it challenging for the baseline to generate unseen belief states. The results also show that more data may be necessary as the model achieves better results on the data splits with a higher number of data samples (\textsl{125-dpd}, \textsl{250-dpd}).

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{16pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.15} % Default value: 1
\begin{tabular}{lc}
\hline
\textbf{\makecell{Data split (\# dialogs)}} & \textbf{JGA} \\
\hline
\textsl{5-dpd} (25) & 9.06 \\
\hline
\textsl{10-dpd} (50) & 14.20 \\
\hline
\textsl{50-dpd} (250) & 28.64 \\
\hline
\textsl{100-dpd} (500) & 33.11 \\
\hline
\textsl{125-dpd} (625) & 35.79 \\
\hline
\textsl{250-dpd} (1125) & \textbf{40.38} \\
\hline
\end{tabular}
\endgroup
\caption{Few-shot experimental results of the \textsc{Soloist} baseline model. The term \textquote{\textsl{dpd}} stands for \textquote{\textsl{dialogues per domain}}.}
\label{table:8}
\end{table}
\subsection{Prompt-based methods}

Table \ref{table:9} shows the results of the prompt-based model under few-shot experiments. Only a single value-based prompt is used in these experiments. Experimental results show the prompt-based model significantly outperforms the baseline model across all data splits. For low-resource data splits like \textsl{5-dpd}, \textsl{10-dpd}, and \textsl{50-dpd}, the prompt-based model shows a substantial improvement over the baseline, increasing JGA by \textit{21}, \textit{28}, and \textit{18} points, respectively.

\vspace{0.25cm}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{14pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.15} % Default value: 1
\begin{tabular}{lcc}
\hline
\textbf{\makecell{Data split (\# dialogs)}} & \textbf{JGA} & \textbf{JGA*}\\
\hline
\textsl{5-dpd} (25) & 30.66 & 71.04 \\
\hline
\textsl{10-dpd} (50) & 42.65 & 86.43 \\
\hline
\textsl{50-dpd} (250) & 47.06 & 91.63 \\
\hline
\textsl{100-dpd} (500) & \textbf{47.74} & \textbf{92.31} \\
\hline
\textsl{125-dpd} (625) & 46.49 & 91.86 \\
\hline
\textsl{250-dpd} (1125) & 47.06 & 92.08 \\
\hline
\end{tabular}
\endgroup
\caption{Few-shot experimental results from the prompt-based model. Only a single \textit{value-based prompt} is used. The term \textquote{\textsl{dpd}} stands for \textquote{\textsl{dialogues per domain}}.}
\label{table:9}
\end{table}

\paragraph{} The prompt-based results also show that increasing the number of data samples yields only minor performance improvements; for example, the prompt-based methods perform nearly identically on the \textsl{50-dpd} and \textsl{250-dpd} splits. This suggests that the prompt-based approach captures the DST task well even under low-resource scenarios. The consistently higher JGA* values across all data splits point to the drawbacks of the rule-based value extraction methods.
\subsection{Multi-prompt methods}
\subsubsection{Prompt Ensembling results}

Table \ref{table:10} shows the results of prompt ensembling under few-shot settings. The prompt ensemble shows a slight improvement over a single value-based prompt but, contrary to expectations, no significant gain on the JGA metric. The performance of the prompt ensemble model is also similar across the larger data splits, i.e., \textsl{50-dpd}, \textsl{100-dpd}, \textsl{125-dpd}, and \textsl{250-dpd}.

\vspace{0.25cm}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{14pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.15} % Default value: 1
\begin{tabular}{lcc}
\hline
\textbf{\makecell{Data split (\# dialogs)}} & \textbf{JGA} & \textbf{JGA*}\\
\hline
\textsl{5-dpd} (25) & 30.09 & 69.23 \\
\hline
\textsl{10-dpd} (50) & 42.84 & 86.99 \\
\hline
\textsl{50-dpd} (250) & 47.62 & 91.74 \\
\hline
\textsl{100-dpd} (500) & \textbf{48.08} & \textbf{92.87} \\
\hline
\textsl{125-dpd} (625) & 46.96 & 92.08 \\
\hline
\textsl{250-dpd} (1125) & \textbf{48.08} & \textbf{92.87} \\
\hline
\end{tabular}
\endgroup
\caption{Few-shot experimental results from prompt ensembling (multi-prompt method). Four \textit{value-based prompts} are used at training and inference time. The term \textquote{\textsl{dpd}} stands for \textquote{\textsl{dialogues per domain}}.}
\label{table:10}
\end{table}
\subsubsection{Prompt Augmentation results}
Table \ref{table:11} shows the results of prompt augmentation under few-shot settings.

\vspace{0.25cm}
\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{14pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.15} % Default value: 1
\begin{tabular}{lcc}
\hline
\textbf{\makecell{Data split (\# dialogs)}} & \textbf{JGA} & \textbf{JGA*}\\
\hline
\textsl{5-dpd} (25) & --- & --- \\
\hline
\textsl{10-dpd} (50) & --- & --- \\
\hline
\textsl{50-dpd} (250) & --- & --- \\
\hline
\textsl{100-dpd} (500) & --- & --- \\
\hline
\textsl{125-dpd} (625) & --- & --- \\
\hline
\textsl{250-dpd} (1125) & --- & --- \\
\hline
\end{tabular}
\endgroup
\caption{Experimental results from demonstration learning (multi-prompt method). The term \textquote{\textsl{dpd}} stands for \textquote{\textsl{dialogues per domain}}.}
\label{table:11}
\end{table}
\newpage
\section{Analysis}\label{sec:analysis}
\subsection{Error analysis of baseline model}

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{6pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.3} % Default value: 1
\begin{tabular}{lp{10.25cm}}
\hline
\multicolumn{2}{c}{\textbf{Wrong belief state predictions}}\\
\hline
\textbf{Dialog History} & \parbox{10.25cm}{
\vspace{.25\baselineskip}
\textsf{user:} we need to find a guesthouse of moderate price.\newline \textsf{system:} do you have any special area you would like to stay? or possibly a star request for the guesthouse?\newline \textsf{user:} i would like it to have a 3 star rating.} \\
\textbf{True belief states} & \textsl{(type, guesthouse) (pricerange, moderate) (stars, 3)} \\
\textbf{Generated states} & \textsl{(parking, yes) (stars, 3)} \\
\hline
\textbf{Dialog History} & \parbox{10.25cm}{
\vspace{.25\baselineskip}
\textsf{user:} i need an expensive place to eat in the west.\newline
\textsf{system:} is there a specific type of food you would like?\newline
\textsf{user:} yes, i would like eat indian food.} \\
\textbf{True belief states} &\textsl{(area, west) (food, indian) (pricerange, expensive)} \\
\textbf{Generated states} &\textsl{(area, west) (food, indian) (pricerange, cheap) (area, east)} \\
\hline
\end{tabular}
\endgroup
\caption{Examples of wrongly generated belief states by the baseline model.}
\label{table:12}
\end{table}

\vspace{0.5cm}

\noindent The belief state prediction of the \textsc{Soloist} baseline uses \textsl{top-k} and \textsl{top-p} sampling to generate the \textsl{(slot, value)} pairs. Because the baseline relies on open-ended generation, it is susceptible to producing irrelevant slot-value pairs. The baseline performance was also affected by repeated slot generations and, in some cases, incorrect values. Table \ref{table:12} shows examples of errors made by the baseline model. In the first example, the baseline misses two true states and generates an entirely incorrect belief state. In the second example, the slot \textit{area} is repeated with a different value and the value for the slot \textit{pricerange} is generated incorrectly.
\subsection{Analysis of prompt-based methods}
\subsubsection{Value-based Prompt}

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{6pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.25} % Default value: 1
\begin{tabular}{lp{10.2cm}}
\hline
\textbf{Dialog History} & \parbox{10.2cm}{
\vspace{.25\baselineskip}
\textsf{user:} I need to be picked up from pizza hut city centre after 04:30} \\
\textbf{True belief states} & \textsl{(departure, pizza hut city centre) (leave, 04:30)} \\
\textbf{Generated states} & \textsl{(destination, pizza hut city centre) (arrive, 04:30)} \\
\hline
\textbf{Dialog History} & \parbox{10.2cm}{
\vspace{.25\baselineskip}
\textsf{user:} I need a taxi to arrive by 16:45 to take me to the parkside police station.}\\
\textbf{True belief states} &\textsl{(destination, parkside police station) (leave, 16:45)}\\
\textbf{Generated states} &\textsl{(destination, parkside police station) (arrive, 16:45)}\\
\hline
\end{tabular}
\endgroup
\caption{Incorrect belief states generated by the value-based prompt.}
\label{table:13}
\end{table}

\noindent The value-based prompt trained on the low-resource data splits (i.e., \textsl{5-dpd}, \textsl{10-dpd}) struggled to distinguish between slots such as \textit{departure} vs.\ \textit{destination} and \textit{leave} vs.\ \textit{arrive}. In many instances, it wrongly generated the slot \textit{destination} instead of \textit{departure} and the slot \textit{arrive} instead of \textit{leave}. Table \ref{table:13} shows example outputs where the slots are generated incorrectly; in both examples, the slot \textit{arrive} is generated instead of the annotated \textit{leave}. These incorrect slot generations are due to the limited training data available for such cases. Overall, the prompt-based methods perform significantly better than the baseline even under low-resource settings, owing to the constrained generation of slots with value-based prompts.
\subsubsection{Impact of Inverse Prompt}

The inverse prompt mechanism can be considered an auxiliary task that complements the value-based prompt and helps generate the slots more accurately. To analyze its impact, experiments are performed in which the inverse prompt is omitted while training the value-based prompt.

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{12pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\begin{tabular}{|l|cc|cc|}
\hline
\textbf{Data split} & \multicolumn{2}{c|}{\textbf{w/o inverse prompt}} & \multicolumn{2}{c|}{\textbf{with inverse prompt}} \\
\textbf{(\# dialogs)} & \textbf{JGA} & \textbf{JGA*} & \textbf{JGA} & \textbf{JGA*} \\
\hline
\textsl{5-dpd} (25) & 26.81 & 64.25 & 30.66 & 71.04 \\
\hline
\textsl{10-dpd} (50) & 41.1 & 82.35 & 42.65 & 86.43 \\
\hline
\textsl{50-dpd} (250) & 45.7 & 90.7 & 47.06 & 91.63 \\
\hline
\textsl{100-dpd} (500) & \textbf{47.74} & \textbf{91.86} & \textbf{47.74} & \textbf{92.31} \\
\hline
\textsl{125-dpd} (625) & 45.02 & 90.61 & 46.49 & 91.86 \\
\hline
\textsl{250-dpd} (1125) & 46.15 & 91.4 & 47.06 & 92.08 \\
\hline
\end{tabular}
\endgroup
\caption{Experimental results showing value-based prompt performance with and without the inverse prompt mechanism during training. The term \textquote{\textsl{dpd}} stands for \textquote{\textsl{dialogues per domain}}; \textquote{\textsl{w/o}} means \textquote{\textsl{without}}.}
\label{table:14}
\end{table}

\noindent The results in Table \ref{table:14} show that the inverse prompt mechanism helped improve the performance of the prompt-based model, especially on the low-resource data splits (i.e., \textsl{5-dpd}, \textsl{10-dpd}). For the \textsl{5-dpd} split, where the training data is very limited, the inverse prompt brings noticeable improvements of roughly \textit{4} points on JGA and \textit{7} points on JGA*. For the data splits with more data samples (i.e., \textsl{100-dpd}, \textsl{125-dpd}, \textsl{250-dpd}), only minor improvements are observed when the inverse prompt is included in training. These results indicate that the inverse prompt mechanism has a noticeable impact on the prompt-based model under extremely low-resource settings.
\subsubsection{Repeated values in Belief States}

In the prompt-based methods, the value-based prompt takes the candidate values and generates the corresponding slots. However, the user requirements may lead to the same value appearing in several (slot, value) pairs of a belief state.

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{5pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\begin{tabular}{lp{11.25cm}}
\hline
\textbf{History} & \parbox{11.25cm}{
\vspace{.25\baselineskip}
\textsf{user:} hi, can you help me find a 3 star place to stay?\newline
\textsf{system:} Is there a particular area or price range you prefer?\newline
\textsf{user:} how about a place in centre of town that is of type hotel.\newline
\textsf{system:} how long would you like to stay, and how many people?\newline
\textsf{user:} I'll arrive on saturday and stay for 3 nights with 3 people.} \\
\textbf{True states} & \textsl{(area, centre) (stars, 3) (type, hotel) (day, saturday) (stay, 3) (people, 3)} \\
\hline
\end{tabular}
\endgroup
\caption{An example instance with repeated values in the (slot, value) pairs.}
\label{table:15}
\end{table}

\noindent The data instance listed in Table \ref{table:15} contains multiple (slot, value) pairs, and the belief slots \textsl{stars}, \textsl{stay}, and \textsl{people} all share the value \textit{3}. The value-based prompt can generate only one slot for the repeated value \textit{3}. This is a major drawback of the value-based prompt under the existing belief state annotation scheme.
\subsubsection{Error Analysis of Value Extraction} \label{subsec:value_errors}

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{6pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.1} % Default value: 1
\begin{tabular}{lp{11.25cm}}
\hline
\textbf{History} & \parbox{11.25cm}{
\vspace{.25\baselineskip}
\textsf{user:} I want a place to stay that has free wifi and free parking.\newline
\textsf{system:} do you have a preference for area or price range?\newline
\textsf{user:} I don't have a preference. I want a hotel not guesthouse.}\\
\textbf{True states} & \textsl{(area, \underline{dont care}) (internet, \underline{yes}) (parking, \underline{yes}) (price, \underline{dont care}) (type, hotel)} \\
\textbf{\makecell[l]{Extracted\\values}} & \textsl{free}, \textsl{hotel} \\
\hline
\textbf{History} & \parbox{11.25cm}{
\vspace{.25\baselineskip}
\textsf{user:} I need a guesthouse with free wifi please.\newline
\textsf{system:} which area would you prefer?\newline
\textsf{user:} I also need free parking, and I prefer a 4 star place.}\\
\textbf{True states} & \textsl{(internet, \underline{yes}) (parking, \underline{yes}) (stars, 4) (type, guesthouse)} \\
\textbf{\makecell[l]{Extracted\\values}} & \textsl{free}, \textsl{guesthouse}, \textsl{4} \\
\hline
\end{tabular}
\endgroup
\caption{Example data instances where values cannot be extracted (underlined).}
\label{table:16}
\end{table}

At inference time, the value-based prompt requires the belief state values in order to generate slots. The value extraction methods apply a set of rules over POS tags and named entities to extract value candidates directly from the utterances. This rule-based extraction has an accuracy of \textit{79\%} on the test split. Table \ref{table:16} highlights instances where values cannot be extracted with the rule-based methods. In the first example, the value \textquote{\textit{dont care}} does not appear in the utterances and therefore cannot be extracted from POS tags. When the user asks for \textit{free} wifi or \textit{free} parking, the existing belief state annotation scheme records the value \textquote{\textit{yes}}, whereas the rule-based methods can only extract the value \textquote{\textit{free}} from the utterances. The values \textquote{\textit{dont care}} and \textquote{\textit{yes}} also occur twice in the examples shown in Table \ref{table:16}; as described in the previous subsection, the value-based prompt cannot handle such repeated values during slot generation.
\vspace{0.5cm}

\begin{table}[h!]
\centering
\begingroup
\setlength{\tabcolsep}{8pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.25} % Default value: 1
\begin{tabular}{lp{10cm}}
\hline
\textbf{History} & \parbox{10cm}{
\vspace{.3\baselineskip}
\textsf{user:} I kind of need some help finding a nice hotel in the north part of town.
}\\
\textbf{True states} & \textsl{(area, north) (price, expensive) (type, hotel)} \\
\textbf{Extracted values} & \textsl{\underline{kind}}, \textsl{\underline{nice}}, \textsl{hotel}, \textsl{north} \\
\hline
%\textbf{History} & \parbox{11.25cm}{
%\vspace{.25\baselineskip}
%\textsf{user:} Hi, are there any expensive restaurants in the city centre?.\newline
%\textsf{system:} Is there a particular type of food you are looking for?\newline
%\textsf{user:} No, can you choose one for me and provide me the address.}\\
%\textbf{True states} & \textsl{(area, centre) (price, expensive) (food, dont care)}\\
%\textbf{Extracted values} & \textsl{expensive}, \textsl{centre}, \textsl{1} \\
%\hline
\end{tabular}
\endgroup
\caption{Example instance where values are extracted incorrectly (underlined).}
\label{table:17}
\end{table}

\paragraph{} After extracting POS tags with Stanza, all \textsl{adjectives} and \textsl{adverbs} in the utterances are considered candidate values. This can lead to false positives among the value candidates. Table \ref{table:17} shows a data instance where some values are extracted incorrectly. The existing annotation scheme associates the user utterance \textquote{a nice hotel} with the value \textquote{\textit{expensive}} for the slot \textsl{price}; such a mapping cannot be captured by the current rule-based methods. The value \textquote{\textit{kind}} is also extracted incorrectly because all \textsl{adverbs} are considered possible values. These limitations of the rule-based value extraction methods used in this thesis degrade the performance of prompt-based DST.
\section{Conclusion}\label{sec:conclusion}

This work explored the use of prompt-based methods for dialog state tracking (DST) in task-oriented dialogue systems. The prompt-based methods, which combine a value-based prompt with an inverse prompt, learned the DST task efficiently under low-resource few-shot settings without relying on a pre-defined set of slots and values. Experiments show that the prompt-based methods significantly outperformed the baseline \textsc{Soloist} model under low-resource settings, although the analysis of the generated belief states also revealed some limitations of the approach. Additionally, multi-prompt methods such as prompt ensembling and prompt augmentation were applied to the DST task; prompt ensembling achieved only minor improvements, and the performance of prompt augmentation is limited by the bias in the answered prompts. The error analysis of value extraction highlights the limitations of the rule-based methods, and further research is necessary to overcome them.