File size: 3,730 Bytes
f5605f0 875a861 f5605f0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | import os
import json
import tiktoken
from transformers import PreTrainedTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken_encoding.json"}
class TiktokenTokenizer(PreTrainedTokenizer):
"""
HuggingFace-compatible tokenizer wrapping tiktoken's cl100k_base.
Produces byte-identical token IDs to tiktoken.get_encoding("cl100k_base").
Tokens are represented internally as their raw bytes decoded via latin-1
(a lossless bijection for arbitrary byte sequences).
"""
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
def __init__(self, vocab_file=None, encoding_name="cl100k_base", **kwargs):
self.encoding_name = encoding_name
self._build_enc()
eot_str = self._id_to_tok[self._enc.eot_token] # "<|endoftext|>"
# When loading from saved config, special tokens are already in kwargs β
# pop them so we don't pass duplicates to super().__init__().
kwargs.pop("bos_token", None)
kwargs.pop("eos_token", None)
kwargs.pop("pad_token", None)
kwargs.pop("unk_token", None)
super().__init__(
encoding_name=encoding_name,
bos_token=eot_str,
eos_token=eot_str,
pad_token=eot_str,
unk_token=eot_str,
**kwargs,
)
def _build_enc(self):
self._enc = tiktoken.get_encoding(self.encoding_name)
self._id_to_tok = {}
self._tok_to_id = {}
for i in range(self._enc.n_vocab):
try:
s = self._enc.decode_single_token_bytes(i).decode("latin-1")
except Exception:
s = f"<|special_{i}|>"
self._id_to_tok[i] = s
self._tok_to_id[s] = i
# ββ Required interface βββββββββββββββββββββββββββββββββββββββββββββββββββββ
@property
def vocab_size(self):
return self._enc.n_vocab # 100277
def get_vocab(self):
return dict(self._tok_to_id)
def _tokenize(self, text):
ids = self._enc.encode(text, allowed_special="all")
return [self._id_to_tok[i] for i in ids]
def _convert_token_to_id(self, token):
return self._tok_to_id.get(token, self._enc.eot_token)
def _convert_id_to_token(self, index):
return self._id_to_tok.get(index, "<|unk|>")
def convert_tokens_to_string(self, tokens):
raw = b"".join(t.encode("latin-1") for t in tokens)
return raw.decode("utf-8", errors="replace")
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""No BOS/EOS added β matches bare tiktoken encode() behaviour."""
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
def save_vocabulary(self, save_directory, filename_prefix=None):
os.makedirs(save_directory, exist_ok=True)
fname = (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
vocab_file = os.path.join(save_directory, fname)
with open(vocab_file, "w", encoding="utf-8") as f:
json.dump({"encoding_name": self.encoding_name}, f)
return (vocab_file,)
# ββ Pickle support (tiktoken objects aren't picklable) βββββββββββββββββββββ
def __getstate__(self):
state = self.__dict__.copy()
state.pop("_enc", None)
state.pop("_id_to_tok", None)
state.pop("_tok_to_id", None)
return state
def __setstate__(self, state):
self.__dict__.update(state)
self._build_enc()
|