SlitherCode
/

tiny-edu-166m

+import os
+import json
+import tiktoken
+from transformers import PreTrainedTokenizer
+VOCAB_FILES_NAMES = {"vocab_file": "tiktoken_encoding.json"}  # filename used by save_vocabulary(); the file stores only the encoding name
+class TiktokenTokenizer(PreTrainedTokenizer):
+    """
+    HuggingFace-compatible tokenizer wrapping tiktoken's cl100k_base.
+    Produces byte-identical token IDs to tiktoken.get_encoding("cl100k_base").
+    Tokens are represented internally as their raw bytes decoded via latin-1
+    (a lossless bijection for arbitrary byte sequences).
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(self, vocab_file=None, encoding_name="cl100k_base", **kwargs):
+        self.encoding_name = encoding_name
+        self._build_enc()
+        eot_str = self._id_to_tok[self._enc.eot_token]  # "<|endoftext|>"
+        super().__init__(
+            encoding_name=encoding_name,
+            bos_token=eot_str,
+            eos_token=eot_str,
+            pad_token=eot_str,
+            unk_token=eot_str,
+            **kwargs,
+        )
+    def _build_enc(self):
+        self._enc = tiktoken.get_encoding(self.encoding_name)
+        self._id_to_tok = {}
+        self._tok_to_id = {}
+        for i in range(self._enc.n_vocab):
+            try:
+                s = self._enc.decode_single_token_bytes(i).decode("latin-1")
+            except Exception:
+                s = f"<|special_{i}|>"
+            self._id_to_tok[i] = s
+            self._tok_to_id[s] = i
+    # ── Required interface ─────────────────────────────────────────────────────
+    @property
+    def vocab_size(self):
+        return self._enc.n_vocab  # 100277
+    def get_vocab(self):
+        return dict(self._tok_to_id)
+    def _tokenize(self, text):
+        ids = self._enc.encode(text, allowed_special="all")
+        return [self._id_to_tok[i] for i in ids]
+    def _convert_token_to_id(self, token):
+        return self._tok_to_id.get(token, self._enc.eot_token)
+    def _convert_id_to_token(self, index):
+        return self._id_to_tok.get(index, "<|unk|>")
+    def convert_tokens_to_string(self, tokens):
+        raw = b"".join(t.encode("latin-1") for t in tokens)
+        return raw.decode("utf-8", errors="replace")
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """No BOS/EOS added — matches bare tiktoken encode() behaviour."""
+        if token_ids_1 is None:
+            return token_ids_0
+        return token_ids_0 + token_ids_1
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        os.makedirs(save_directory, exist_ok=True)
+        fname = (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        vocab_file = os.path.join(save_directory, fname)
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            json.dump({"encoding_name": self.encoding_name}, f)
+        return (vocab_file,)
+    # ── Pickle support (tiktoken objects aren't picklable) ─────────────────────
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state.pop("_enc", None)
+        state.pop("_id_to_tok", None)
+        state.pop("_tok_to_id", None)
+        return state
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._build_enc()