diff --git a/syntaxdot-tokenizers/src/xlm_roberta.rs b/syntaxdot-tokenizers/src/xlm_roberta.rs
index cbbb0b2..4ef18e6 100644
--- a/syntaxdot-tokenizers/src/xlm_roberta.rs
+++ b/syntaxdot-tokenizers/src/xlm_roberta.rs
@@ -76,7 +76,7 @@ impl Tokenize for XlmRobertaTokenizer {
                 // tokens. However, the input may be corrupt and use
                 // some form of non-tab whitespace as a form, for which
                 // sentencepiece does not return any identifier.
-                pieces.push(self.spp.unk_id() as i64 + FAIRSEQ_OFFSET);
+                pieces.push(FAIRSEQ_UNK);
             }
         }
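
The sketch below illustrates why pushing FAIRSEQ_UNK is preferable to offsetting the sentencepiece unknown id. It assumes the usual fairseq/XLM-R id layout (<s>=0, <pad>=1, </s>=2, <unk>=3, sentencepiece pieces shifted by an offset of 1); the constant values and the helper fairseq_id are illustrative assumptions, not code taken from the crate. Under that layout, sentencepiece's own unk id is typically 0, so unk_id() + FAIRSEQ_OFFSET would land on fairseq's <pad> (id 1) rather than its <unk> (id 3).

// Minimal sketch of the assumed fairseq/XLM-R id convention; constant values
// follow the well-known fairseq layout and are not copied from syntaxdot.
const FAIRSEQ_OFFSET: i64 = 1;
const FAIRSEQ_UNK: i64 = 3;

// Map an optional sentencepiece piece id into the fairseq id space.
fn fairseq_id(sp_piece_id: Option<i64>) -> i64 {
    match sp_piece_id {
        // Regular pieces are shifted past the fairseq special tokens.
        Some(id) => id + FAIRSEQ_OFFSET,
        // Sentencepiece returned no identifier (e.g. corrupt whitespace input):
        // use the fairseq <unk> id directly. Adding the offset to sentencepiece's
        // unk id (usually 0) would instead collide with fairseq's <pad> (id 1).
        None => FAIRSEQ_UNK,
    }
}

fn main() {
    assert_eq!(fairseq_id(Some(5)), 6);
    assert_eq!(fairseq_id(None), FAIRSEQ_UNK);
    println!("unknown pieces map to id {}", fairseq_id(None));
}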