@inproceedings{kyrylov-chaplynskyi-2023-gpt,
title = "{GPT}-2 Metadata Pretraining Towards Instruction Finetuning for {U}krainian",
author = "Kyrylov, Volodymyr and
Chaplynskyi, Dmytro",
editor = "Romanyshyn, Mariana",
booktitle = "Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.unlp-1.4",
doi = "10.18653/v1/2023.unlp-1.4",
pages = "32--39",
    abstract = "We explore pretraining unidirectional language models on 4B tokens from the largest curated corpus of Ukrainian, UberText 2.0. We enrich document text by surrounding it with weakly structured metadata, such as title, tags, and publication year, enabling metadata-conditioned text generation and text-conditioned metadata prediction at the same time. We pretrain GPT-2 Small, Medium and Large models, each on a single GPU, reporting training times, BPC on BrUK and BERTScore on titles for 1000 News from the Future. Next, we venture into formatting POS and NER datasets as instructions, and train low-rank attention adapters, performing these tasks as constrained text generation. We release our models for the community at \url{https://github.com/proger/uk4b}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kyrylov-chaplynskyi-2023-gpt">
    <titleInfo>
      <title>GPT-2 Metadata Pretraining Towards Instruction Finetuning for Ukrainian</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Volodymyr</namePart>
      <namePart type="family">Kyrylov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dmytro</namePart>
      <namePart type="family">Chaplynskyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We explore pretraining unidirectional language models on 4B tokens from the largest curated corpus of Ukrainian, UberText 2.0. We enrich document text by surrounding it with weakly structured metadata, such as title, tags, and publication year, enabling metadata-conditioned text generation and text-conditioned metadata prediction at the same time. We pretrain GPT-2 Small, Medium and Large models, each on a single GPU, reporting training times, BPC on BrUK and BERTScore on titles for 1000 News from the Future. Next, we venture into formatting POS and NER datasets as instructions, and train low-rank attention adapters, performing these tasks as constrained text generation. We release our models for the community at https://github.com/proger/uk4b.</abstract>
    <identifier type="citekey">kyrylov-chaplynskyi-2023-gpt</identifier>
    <identifier type="doi">10.18653/v1/2023.unlp-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2023.unlp-1.4</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>32</start>
        <end>39</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T GPT-2 Metadata Pretraining Towards Instruction Finetuning for Ukrainian
%A Kyrylov, Volodymyr
%A Chaplynskyi, Dmytro
%Y Romanyshyn, Mariana
%S Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F kyrylov-chaplynskyi-2023-gpt
%X We explore pretraining unidirectional language models on 4B tokens from the largest curated corpus of Ukrainian, UberText 2.0. We enrich document text by surrounding it with weakly structured metadata, such as title, tags, and publication year, enabling metadata-conditioned text generation and text-conditioned metadata prediction at the same time. We pretrain GPT-2 Small, Medium and Large models, each on a single GPU, reporting training times, BPC on BrUK and BERTScore on titles for 1000 News from the Future. Next, we venture into formatting POS and NER datasets as instructions, and train low-rank attention adapters, performing these tasks as constrained text generation. We release our models for the community at https://github.com/proger/uk4b.
%R 10.18653/v1/2023.unlp-1.4
%U https://aclanthology.org/2023.unlp-1.4
%U https://doi.org/10.18653/v1/2023.unlp-1.4
%P 32-39
Markdown (Informal)
[GPT-2 Metadata Pretraining Towards Instruction Finetuning for Ukrainian](https://aclanthology.org/2023.unlp-1.4) (Kyrylov & Chaplynskyi, UNLP 2023)
ACL
Volodymyr Kyrylov and Dmytro Chaplynskyi. 2023. GPT-2 Metadata Pretraining Towards Instruction Finetuning for Ukrainian. In Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP), pages 32–39, Dubrovnik, Croatia. Association for Computational Linguistics.
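
The abstract above describes serializing each document together with weakly structured metadata (title, tags, publication year), so that one GPT-2 model learns both metadata-conditioned text generation and text-conditioned metadata prediction. Below is a minimal, hypothetical Python sketch of that serialization idea; the field names, delimiters, and end-of-text marker are illustrative assumptions, not the exact format used in the authors' uk4b repository.

```python
# Minimal sketch of metadata-enriched document serialization, as described in
# the abstract. All field names, delimiters, and the end-of-text marker are
# assumptions for illustration; see https://github.com/proger/uk4b for the
# authors' actual preprocessing and model code.

def serialize(doc: dict, metadata_first: bool = True) -> str:
    """Flatten a document and its weakly structured metadata into one training string."""
    meta = "\n".join(f"{key}: {doc[key]}" for key in ("title", "tags", "year") if key in doc)
    # Metadata before the text trains metadata-conditioned text generation;
    # metadata after the text trains text-conditioned metadata prediction.
    parts = [meta, doc["text"]] if metadata_first else [doc["text"], meta]
    return "\n\n".join(parts) + "\n<|endoftext|>"

example = {
    "title": "Новини з майбутнього",  # "News from the future"
    "tags": "новини, технології",     # "news, technology"
    "year": 2023,
    "text": "Текст документа ...",    # document body
}

print(serialize(example, metadata_first=True))   # condition generation on metadata
print(serialize(example, metadata_first=False))  # predict metadata from text
```

The same formatting idea extends to the POS and NER instruction data mentioned in the abstract: an instruction and the input sentence are serialized as a prompt, and the adapter-tuned model completes it under decoding constraints that only admit valid tag sequences.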