SlitherCode committed on
Commit
f5605f0
·
verified ·
1 Parent(s): e4a9f6a

Add TiktokenTokenizer (cl100k_base wrapper)

Browse files
Files changed (1) hide show
  1. tokenization_parchment.py +94 -0
tokenization_parchment.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tiktoken
4
+ from transformers import PreTrainedTokenizer
5
+
6
# File name under which save_vocabulary() stores the encoding metadata.
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken_encoding.json"}


class TiktokenTokenizer(PreTrainedTokenizer):
    """
    HuggingFace-compatible tokenizer wrapping tiktoken's cl100k_base.

    Produces byte-identical token IDs to tiktoken.get_encoding("cl100k_base").
    Tokens are represented internally as their raw bytes decoded via latin-1
    (a lossless bijection for arbitrary byte sequences).

    IDs inside ``range(n_vocab)`` for which tiktoken has no token are given
    the placeholder string ``<|special_{id}|>`` so that every ID in the range
    has an entry in both lookup tables.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file=None, encoding_name="cl100k_base", **kwargs):
        """Create the tokenizer for the tiktoken encoding *encoding_name*.

        Args:
            vocab_file: Accepted so the signature lines up with
                ``vocab_files_names``; this constructor never reads it.
            encoding_name: Name passed to ``tiktoken.get_encoding``.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        self.encoding_name = encoding_name
        # Build the lookup tables first: the end-of-text string used below is
        # read from them.
        self._build_enc()

        eot_str = self._id_to_tok[self._enc.eot_token]  # "<|endoftext|>"
        # The end-of-text token doubles as BOS, EOS, PAD and UNK.
        super().__init__(
            encoding_name=encoding_name,
            bos_token=eot_str,
            eos_token=eot_str,
            pad_token=eot_str,
            unk_token=eot_str,
            **kwargs,
        )

    def _build_enc(self):
        """Load the tiktoken encoding and (re)build both id/token tables."""
        self._enc = tiktoken.get_encoding(self.encoding_name)
        self._id_to_tok = {}
        self._tok_to_id = {}
        for i in range(self._enc.n_vocab):
            try:
                # latin-1 maps each byte 0-255 to the code point of the same
                # value, so this decode cannot fail and is reversible.
                s = self._enc.decode_single_token_bytes(i).decode("latin-1")
            except KeyError:
                # tiktoken signals "no token with this id" with KeyError;
                # fill the hole with a placeholder. Any other error is a real
                # failure and is allowed to propagate.
                s = f"<|special_{i}|>"
            self._id_to_tok[i] = s
            self._tok_to_id[s] = i

    # ── Required interface ─────────────────────────────────────────────────────

    @property
    def vocab_size(self):
        """Return the ``n_vocab`` reported by the underlying tiktoken encoding."""
        return self._enc.n_vocab  # 100277 for cl100k_base

    def get_vocab(self):
        """Return a fresh copy of the token-string -> id table."""
        return dict(self._tok_to_id)

    def _tokenize(self, text):
        """Encode *text* with tiktoken and return the token strings.

        All special tokens are allowed, so special-token text found in the
        input is encoded as the special token rather than rejected.
        """
        ids = self._enc.encode(text, allowed_special="all")
        return [self._id_to_tok[i] for i in ids]

    def _convert_token_to_id(self, token):
        """Return the id of *token*, or the end-of-text id if it is unknown."""
        return self._tok_to_id.get(token, self._enc.eot_token)

    def _convert_id_to_token(self, index):
        """Return the token string for *index*, or ``"<|unk|>"`` if unknown."""
        return self._id_to_tok.get(index, "<|unk|>")

    def convert_tokens_to_string(self, tokens):
        """Join token strings back into text.

        Each token is turned back into its raw bytes via latin-1, the bytes
        are concatenated, and the result is decoded as UTF-8 with invalid
        sequences replaced. A token that cannot be latin-1 encoded (it holds a
        code point above U+00FF, so it is not one of this vocabulary's
        byte-strings) is treated as ordinary text and encoded as UTF-8
        instead of raising ``UnicodeEncodeError``.
        """
        chunks = []
        for tok in tokens:
            try:
                chunks.append(tok.encode("latin-1"))
            except UnicodeEncodeError:
                chunks.append(tok.encode("utf-8"))
        raw = b"".join(chunks)
        return raw.decode("utf-8", errors="replace")

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """No BOS/EOS added — matches bare tiktoken encode() behaviour."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the encoding name as JSON into *save_directory*.

        Only ``{"encoding_name": ...}`` is stored; the vocabulary itself is
        obtained from tiktoken by name when the tokenizer is built.

        Returns:
            A 1-tuple containing the path of the file written.
        """
        os.makedirs(save_directory, exist_ok=True)
        fname = (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        vocab_file = os.path.join(save_directory, fname)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({"encoding_name": self.encoding_name}, f)
        return (vocab_file,)

    # ── Pickle support (tiktoken objects aren't picklable) ─────────────────────

    def __getstate__(self):
        """Return instance state without the encoder and its derived tables."""
        state = self.__dict__.copy()
        state.pop("_enc", None)
        state.pop("_id_to_tok", None)
        state.pop("_tok_to_id", None)
        return state

    def __setstate__(self, state):
        """Restore state, then rebuild the encoder from ``encoding_name``."""
        self.__dict__.update(state)
        self._build_enc()