Add TiktokenTokenizer (cl100k_base wrapper)
Browse files
tokenization_parchment.py
CHANGED
|
@@ -22,6 +22,14 @@ class TiktokenTokenizer(PreTrainedTokenizer):
|
|
| 22 |
self._build_enc()
|
| 23 |
|
| 24 |
eot_str = self._id_to_tok[self._enc.eot_token] # "<|endoftext|>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
super().__init__(
|
| 26 |
encoding_name=encoding_name,
|
| 27 |
bos_token=eot_str,
|
|
|
|
| 22 |
self._build_enc()
|
| 23 |
|
| 24 |
eot_str = self._id_to_tok[self._enc.eot_token] # "<|endoftext|>"
|
| 25 |
+
|
| 26 |
+
# When loading from a saved config, the special tokens are already present in kwargs —
|
| 27 |
+
# pop them (discarding any passed-in values) so we don't pass duplicates to super().__init__().
|
| 28 |
+
kwargs.pop("bos_token", None)
|
| 29 |
+
kwargs.pop("eos_token", None)
|
| 30 |
+
kwargs.pop("pad_token", None)
|
| 31 |
+
kwargs.pop("unk_token", None)
|
| 32 |
+
|
| 33 |
super().__init__(
|
| 34 |
encoding_name=encoding_name,
|
| 35 |
bos_token=eot_str,
|