SlitherCode committed on
Commit
875a861
·
verified ·
1 Parent(s): dd7cc67

Add TiktokenTokenizer (cl100k_base wrapper)

Browse files
Files changed (1) hide show
  1. tokenization_parchment.py +8 -0
tokenization_parchment.py CHANGED
@@ -22,6 +22,14 @@ class TiktokenTokenizer(PreTrainedTokenizer):
22
  self._build_enc()
23
 
24
  eot_str = self._id_to_tok[self._enc.eot_token] # "<|endoftext|>"
 
 
 
 
 
 
 
 
25
  super().__init__(
26
  encoding_name=encoding_name,
27
  bos_token=eot_str,
 
22
  self._build_enc()
23
 
24
  eot_str = self._id_to_tok[self._enc.eot_token] # "<|endoftext|>"
25
+
26
+ # When loading from saved config, special tokens are already in kwargs —
27
+ # pop them so we don't pass duplicates to super().__init__().
28
+ kwargs.pop("bos_token", None)
29
+ kwargs.pop("eos_token", None)
30
+ kwargs.pop("pad_token", None)
31
+ kwargs.pop("unk_token", None)
32
+
33
  super().__init__(
34
  encoding_name=encoding_name,
35
  bos_token=eot_str,