@article{yang2022prompt,
  author = {Yuting Yang and Wenqiang Lei and Juan Cao and Jintao Li and Tat{-}Seng Chua},
  title = {Prompt Learning for Few-Shot Dialogue State Tracking},
  journal = {CoRR},
  volume = {abs/2201.05780},
  year = {2022},
  url = {https://arxiv.org/abs/2201.05780},
  eprinttype = {arXiv},
  eprint = {2201.05780}
}

@article{liu2021ppp,
  author = {Pengfei Liu and Weizhe Yuan and Jinlan Fu and Zhengbao Jiang and Hiroaki Hayashi and Graham Neubig},
  title = {Pre-train, Prompt, and Predict: {A} Systematic Survey of Prompting Methods in Natural Language Processing},
  journal = {CoRR},
  volume = {abs/2107.13586},
  year = {2021},
  url = {https://arxiv.org/abs/2107.13586},
  eprinttype = {arXiv},
  eprint = {2107.13586}
}

@inproceedings{brown2020gpt3,
  author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
  volume = {33},
  pages = {1877--1901},
  publisher = {Curran Associates, Inc.},
  year = {2020},
  url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}
}

@article{madotto2021fsb,
  author = {Andrea Madotto and Zhaojiang Lin and Genta Indra Winata and Pascale Fung},
  title = {Few-Shot Bot: Prompt-Based Learning for Dialogue Systems},
  journal = {CoRR},
  volume = {abs/2110.08118},
  year = {2021},
  url = {https://arxiv.org/abs/2110.08118},
  eprinttype = {arXiv},
  eprint = {2110.08118}
}

@article{radford2018gpt,
  author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya and others},
  title = {Improving Language Understanding by Generative Pre-Training},
  year = {2018},
  publisher = {OpenAI}
}

@article{radford2019gpt2,
  author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
  title = {Language Models are Unsupervised Multitask Learners},
  journal = {OpenAI Blog},
  volume = {1},
  number = {8},
  pages = {9},
  year = {2019}
}

@inproceedings{devlin2019bert,
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = {2019},
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1423},
  doi = {10.18653/v1/N19-1423},
  pages = {4171--4186}
}

@article{peng2021soloist,
  author = {Peng, Baolin and Li, Chunyuan and Li, Jinchao and Shayandeh, Shahin and Liden, Lars and Gao, Jianfeng},
  title = {{SOLOIST:} Building Task Bots at Scale with Transfer Learning and Machine Teaching},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {9},
  year = {2021},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  url = {https://aclanthology.org/2021.tacl-1.49},
  doi = {10.1162/tacl_a_00399},
  pages = {807--824}
}

@article{lee2021sdp,
  author = {Chia{-}Hsuan Lee and Hao Cheng and Mari Ostendorf},
  title = {Dialogue State Tracking with a Language Model using Schema-Driven Prompting},
  journal = {CoRR},
  volume = {abs/2109.07506},
  year = {2021},
  url = {https://arxiv.org/abs/2109.07506},
  eprinttype = {arXiv},
  eprint = {2109.07506}
}

@article{eric2019multiwoz,
  author = {Mihail Eric and Rahul Goel and Shachi Paul and Abhishek Sethi and Sanchit Agarwal and Shuyang Gao and Dilek Hakkani{-}T{\"{u}}r},
  title = {MultiWOZ 2.1: Multi-Domain Dialogue State Corrections and State Tracking Baselines},
  journal = {CoRR},
  volume = {abs/1907.01669},
  year = {2019},
  url = {http://arxiv.org/abs/1907.01669},
  eprinttype = {arXiv},
  eprint = {1907.01669}
}

@inproceedings{budzianowski2018multiwoz,
  author = {Budzianowski, Pawe{\l} and Wen, Tsung-Hsien and Tseng, Bo-Hsiang and Casanueva, I{\~n}igo and Ultes, Stefan and Ramadan, Osman and Ga{\v{s}}i{\'c}, Milica},
  title = {{M}ulti{WOZ} - A Large-Scale Multi-Domain {W}izard-of-{O}z Dataset for Task-Oriented Dialogue Modelling},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month = oct # "-" # nov,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D18-1547},
  doi = {10.18653/v1/D18-1547},
  pages = {5016--5026}
}

@inproceedings{min2020dsi,
  author = {Min, Qingkai and Qin, Libo and Teng, Zhiyang and Liu, Xiao and Zhang, Yue},
  title = {Dialogue State Induction Using Neural Latent Variable Models},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
  editor = {Christian Bessiere},
  publisher = {International Joint Conferences on Artificial Intelligence Organization},
  pages = {3845--3852},
  year = {2020},
  month = {7},
  note = {Main track},
  doi = {10.24963/ijcai.2020/532},
  url = {https://doi.org/10.24963/ijcai.2020/532}
}

@inproceedings{gao2021lmbff,
  author = {Gao, Tianyu and Fisch, Adam and Chen, Danqi},
  title = {Making Pre-trained Language Models Better Few-shot Learners},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month = aug,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.acl-long.295},
  doi = {10.18653/v1/2021.acl-long.295},
  pages = {3816--3830}
}

@inproceedings{cui2021template,
  author = {Cui, Leyang and Wu, Yu and Liu, Jian and Yang, Sen and Zhang, Yue},
  title = {Template-Based Named Entity Recognition Using {BART}},
  booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
  month = aug,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.findings-acl.161},
  doi = {10.18653/v1/2021.findings-acl.161},
  pages = {1835--1845}
}

@inproceedings{schick2021pet,
  author = {Schick, Timo and Sch{\"u}tze, Hinrich},
  title = {Few-Shot Text Generation with Natural Language Instructions},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  month = nov,
  year = {2021},
  address = {Online and Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.emnlp-main.32},
  doi = {10.18653/v1/2021.emnlp-main.32},
  pages = {390--402}
}

@inproceedings{li2021coco,
  author = {Shiyang Li and Semih Yavuz and Kazuma Hashimoto and Jia Li and Tong Niu and Nazneen Rajani and Xifeng Yan and Yingbo Zhou and Caiming Xiong},
  title = {CoCo: Controllable Counterfactuals for Evaluating Dialogue State Trackers},
  booktitle = {International Conference on Learning Representations},
  year = {2021},
  url = {https://openreview.net/forum?id=eom0IUrF__F}
}

@inproceedings{vaswani2017attention,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
  title = {Attention is All you Need},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  volume = {30},
  publisher = {Curran Associates, Inc.},
  year = {2017},
  url = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf}
}

@inproceedings{holtzman2020topp,
  author = {Ari Holtzman and Jan Buys and Li Du and Maxwell Forbes and Yejin Choi},
  title = {The Curious Case of Neural Text Degeneration},
  booktitle = {International Conference on Learning Representations},
  year = {2020},
  url = {https://openreview.net/forum?id=rygGQyrFvH}
}

@inproceedings{fan2018topk,
  author = {Fan, Angela and Lewis, Mike and Dauphin, Yann},
  title = {Hierarchical Neural Story Generation},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/P18-1082},
  doi = {10.18653/v1/P18-1082},
  pages = {889--898}
}

@inproceedings{qi2020stanza,
  author = {Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.},
  title = {{S}tanza: A Python Natural Language Processing Toolkit for Many Human Languages},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.acl-demos.14},
  doi = {10.18653/v1/2020.acl-demos.14},
  pages = {101--108}
}

@inproceedings{wolf2020transformers,
  author = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and Davison, Joe and Shleifer, Sam and von Platen, Patrick and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander},
  title = {Transformers: State-of-the-Art Natural Language Processing},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month = oct,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.emnlp-demos.6},
  doi = {10.18653/v1/2020.emnlp-demos.6},
  pages = {38--45}
}

@inproceedings{kingma2015adam,
  author = {Diederik P. Kingma and Jimmy Ba},
  title = {Adam: {A} Method for Stochastic Optimization},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  editor = {Yoshua Bengio and Yann LeCun},
  year = {2015},
  url = {http://arxiv.org/abs/1412.6980}
}

@article{ni2021dlds,
  author = {Jinjie Ni and Tom Young and Vlad Pandelea and Fuzhao Xue and Vinay Adiga and Erik Cambria},
  title = {Recent Advances in Deep Learning Based Dialogue Systems: {A} Systematic Survey},
  journal = {CoRR},
  volume = {abs/2105.04387},
  year = {2021},
  url = {https://arxiv.org/abs/2105.04387},
  eprinttype = {arXiv},
  eprint = {2105.04387}
}

@inproceedings{wu2020tod-bert,
  author = {Wu, Chien-Sheng and Hoi, Steven C.H. and Socher, Richard and Xiong, Caiming},
  title = {{TOD}-{BERT}: Pre-trained Natural Language Understanding for Task-Oriented Dialogue},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.emnlp-main.66},
  doi = {10.18653/v1/2020.emnlp-main.66},
  pages = {917--929}
}