@article{yang2022prompt,
  author = {Yuting Yang and Wenqiang Lei and Juan Cao and Jintao Li and Tat{-}Seng Chua},
  title = {Prompt Learning for Few-Shot Dialogue State Tracking},
  journal = {CoRR},
  volume = {abs/2201.05780},
  year = {2022},
  url = {https://arxiv.org/abs/2201.05780},
  eprinttype = {arXiv},
  eprint = {2201.05780}
}

@article{liu2021ppp,
  author = {Pengfei Liu and Weizhe Yuan and Jinlan Fu and Zhengbao Jiang and Hiroaki Hayashi and Graham Neubig},
  title = {Pre-train, Prompt, and Predict: {A} Systematic Survey of Prompting Methods in Natural Language Processing},
  journal = {CoRR},
  volume = {abs/2107.13586},
  year = {2021},
  url = {https://arxiv.org/abs/2107.13586},
  eprinttype = {arXiv},
  eprint = {2107.13586}
}

@inproceedings{brown2020gpt3,
  author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
  volume = {33},
  pages = {1877--1901},
  publisher = {Curran Associates, Inc.},
  year = {2020},
  url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}
}

@article{madotto2021fsb,
  author = {Andrea Madotto and Zhaojiang Lin and Genta Indra Winata and Pascale Fung},
  title = {Few-Shot Bot: Prompt-Based Learning for Dialogue Systems},
  journal = {CoRR},
  volume = {abs/2110.08118},
  year = {2021},
  url = {https://arxiv.org/abs/2110.08118},
  eprinttype = {arXiv},
  eprint = {2110.08118}
}

@article{radford2018gpt,
  author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya and others},
  title = {Improving Language Understanding by Generative Pre-Training},
  year = {2018},
  publisher = {OpenAI}
}

@article{radford2019gpt2,
  author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
  title = {Language Models are Unsupervised Multitask Learners},
  journal = {OpenAI Blog},
  volume = {1},
  number = {8},
  pages = {9},
  year = {2019}
}

@inproceedings{devlin2019bert,
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = {2019},
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1423},
  doi = {10.18653/v1/N19-1423},
  pages = {4171--4186}
}

@article{peng2021soloist,
  author = {Peng, Baolin and Li, Chunyuan and Li, Jinchao and Shayandeh, Shahin and Liden, Lars and Gao, Jianfeng},
  title = {{SOLOIST:} Building Task Bots at Scale with Transfer Learning and Machine Teaching},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {9},
  year = {2021},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  url = {https://aclanthology.org/2021.tacl-1.49},
  doi = {10.1162/tacl_a_00399},
  pages = {807--824}
}

@article{lee2021sdp,
  author = {Chia{-}Hsuan Lee and Hao Cheng and Mari Ostendorf},
  title = {Dialogue State Tracking with a Language Model using Schema-Driven Prompting},
  journal = {CoRR},
  volume = {abs/2109.07506},
  year = {2021},
  url = {https://arxiv.org/abs/2109.07506},
  eprinttype = {arXiv},
  eprint = {2109.07506}
}

@article{eric2019multiwoz,
  author = {Mihail Eric and Rahul Goel and Shachi Paul and Abhishek Sethi and Sanchit Agarwal and Shuyang Gao and Dilek Hakkani{-}T{\"{u}}r},
  title = {MultiWOZ 2.1: Multi-Domain Dialogue State Corrections and State Tracking Baselines},
  journal = {CoRR},
  volume = {abs/1907.01669},
  year = {2019},
  url = {http://arxiv.org/abs/1907.01669},
  eprinttype = {arXiv},
  eprint = {1907.01669}
}

@inproceedings{budzianowski2018multiwoz,
  author = {Budzianowski, Pawe{\l} and Wen, Tsung-Hsien and Tseng, Bo-Hsiang and Casanueva, I{\~n}igo and Ultes, Stefan and Ramadan, Osman and Ga{\v{s}}i{\'c}, Milica},
  title = {{M}ulti{WOZ} - A Large-Scale Multi-Domain {W}izard-of-{O}z Dataset for Task-Oriented Dialogue Modelling},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month = oct # "-" # nov,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D18-1547},
  doi = {10.18653/v1/D18-1547},
  pages = {5016--5026}
}

@inproceedings{min2020dsi,
  author = {Min, Qingkai and Qin, Libo and Teng, Zhiyang and Liu, Xiao and Zhang, Yue},
  title = {Dialogue State Induction Using Neural Latent Variable Models},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
  editor = {Christian Bessiere},
  publisher = {International Joint Conferences on Artificial Intelligence Organization},
  pages = {3845--3852},
  year = {2020},
  month = {7},
  note = {Main track},
  doi = {10.24963/ijcai.2020/532},
  url = {https://doi.org/10.24963/ijcai.2020/532}
}

@inproceedings{gao2021lmbff,
  author = {Gao, Tianyu and Fisch, Adam and Chen, Danqi},
  title = {Making Pre-trained Language Models Better Few-shot Learners},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month = aug,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.acl-long.295},
  doi = {10.18653/v1/2021.acl-long.295},
  pages = {3816--3830}
}

@inproceedings{cui2021template,
  author = {Cui, Leyang and Wu, Yu and Liu, Jian and Yang, Sen and Zhang, Yue},
  title = {Template-Based Named Entity Recognition Using {BART}},
  booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
  month = aug,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.findings-acl.161},
  doi = {10.18653/v1/2021.findings-acl.161},
  pages = {1835--1845}
}

@inproceedings{schick2021pet,
  author = {Schick, Timo and Sch{\"u}tze, Hinrich},
  title = {Few-Shot Text Generation with Natural Language Instructions},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  month = nov,
  year = {2021},
  address = {Online and Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.emnlp-main.32},
  doi = {10.18653/v1/2021.emnlp-main.32},
  pages = {390--402}
}

@inproceedings{li2021coco,
  author = {Shiyang Li and Semih Yavuz and Kazuma Hashimoto and Jia Li and Tong Niu and Nazneen Rajani and Xifeng Yan and Yingbo Zhou and Caiming Xiong},
  title = {CoCo: Controllable Counterfactuals for Evaluating Dialogue State Trackers},
  booktitle = {International Conference on Learning Representations},
  year = {2021},
  url = {https://openreview.net/forum?id=eom0IUrF__F}
}

@inproceedings{vaswani2017attention,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
  title = {Attention is All you Need},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  volume = {30},
  publisher = {Curran Associates, Inc.},
  year = {2017},
  url = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf}
}

@inproceedings{holtzman2020topp,
  author = {Ari Holtzman and Jan Buys and Li Du and Maxwell Forbes and Yejin Choi},
  title = {The Curious Case of Neural Text Degeneration},
  booktitle = {International Conference on Learning Representations},
  year = {2020},
  url = {https://openreview.net/forum?id=rygGQyrFvH}
}

@inproceedings{fan2018topk,
  author = {Fan, Angela and Lewis, Mike and Dauphin, Yann},
  title = {Hierarchical Neural Story Generation},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/P18-1082},
  doi = {10.18653/v1/P18-1082},
  pages = {889--898}
}

@inproceedings{qi2020stanza,
  author = {Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.},
  title = {{S}tanza: A Python Natural Language Processing Toolkit for Many Human Languages},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.acl-demos.14},
  doi = {10.18653/v1/2020.acl-demos.14},
  pages = {101--108}
}

@inproceedings{wolf2020transformers,
  author = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and Davison, Joe and Shleifer, Sam and von Platen, Patrick and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander},
  title = {Transformers: State-of-the-Art Natural Language Processing},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month = oct,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.emnlp-demos.6},
  doi = {10.18653/v1/2020.emnlp-demos.6},
  pages = {38--45}
}

@inproceedings{kingma2015adam,
  author = {Diederik P. Kingma and Jimmy Ba},
  title = {Adam: {A} Method for Stochastic Optimization},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  editor = {Yoshua Bengio and Yann LeCun},
  year = {2015},
  url = {http://arxiv.org/abs/1412.6980}
}

@article{ni2021dlds,
  author = {Jinjie Ni and Tom Young and Vlad Pandelea and Fuzhao Xue and Vinay Adiga and Erik Cambria},
  title = {Recent Advances in Deep Learning Based Dialogue Systems: {A} Systematic Survey},
  journal = {CoRR},
  volume = {abs/2105.04387},
  year = {2021},
  url = {https://arxiv.org/abs/2105.04387},
  eprinttype = {arXiv},
  eprint = {2105.04387}
}

@inproceedings{wu2020tod-bert,
  author = {Wu, Chien-Sheng and Hoi, Steven C.H. and Socher, Richard and Xiong, Caiming},
  title = {{TOD}-{BERT}: Pre-trained Natural Language Understanding for Task-Oriented Dialogue},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.emnlp-main.66},
  doi = {10.18653/v1/2020.emnlp-main.66},
  pages = {917--929}
}