From b12380232e64b66bbff15221af95b49e4e4a7a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 19 Oct 2023 09:31:15 +0200 Subject: [PATCH] XLM-R tokenizer, return correct unk id for corrupted input This should never happen, but we returned the incorrect unknown piece identifier in the worst-case fallback (where tokenization doesn't return any pieces). --- syntaxdot-tokenizers/src/xlm_roberta.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/syntaxdot-tokenizers/src/xlm_roberta.rs b/syntaxdot-tokenizers/src/xlm_roberta.rs index cbbb0b2..4ef18e6 100644 --- a/syntaxdot-tokenizers/src/xlm_roberta.rs +++ b/syntaxdot-tokenizers/src/xlm_roberta.rs @@ -76,7 +76,7 @@ impl Tokenize for XlmRobertaTokenizer { // tokens. However, the input may be corrupt and use // some form of non-tab whitespace as a form, for which // sentencepiece does not return any identifier. - pieces.push(self.spp.unk_id() as i64 + FAIRSEQ_OFFSET); + pieces.push(FAIRSEQ_UNK); } }