class GPT4Tokenizer(RegexTokenizer):
    """Lightweight wrapper on RegexTokenizer that matches GPT-4's tokenizer."""

    def __init__(self):
        super().__init__(pattern=GPT4_SPLIT_PATTERN)
        # get the official tokenizer and its merges
        enc = tiktoken.get_encoding("cl100k_base")
        mergeable_ranks = enc._mergeable_ranks
        # the merges are those of gpt4, but we have to recover them
        self.merges = recover_merges(mergeable_ranks)
        # reconstruct the vocab from the merges
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        self.vocab = vocab
        # now here is another tricky part.
        # for some reason, the tokens corresponding to individual bytes
        # are permuted in a different order. This is completely non-sensical
        # and probably historical, but therefore we have to deal with it here.
        self.byte_shuffle = {i: mergeable_ranks[bytes([i])] for i in range(256)}
        self.inverse_byte_shuffle = {v: k for k, v in self.byte_shuffle.items()}
        # finally register the special tokens
        self.register_special_tokens(GPT4_SPECIAL_TOKENS)
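For context on how this permutation is actually used: the base-256 part of the reconstructed vocab is in raw byte order, so the input bytes have to be pushed through byte_shuffle before the merges run, and pulled back through inverse_byte_shuffle when decoding. Below is a sketch of the two overrides, closely following minbpe's gpt4.py; treat it as illustrative rather than a verbatim copy.

# inside class GPT4Tokenizer(RegexTokenizer):

    def _encode_chunk(self, text_bytes):
        # permute the raw bytes into GPT-4's byte order before running the merges
        text_bytes = bytes(self.byte_shuffle[b] for b in text_bytes)
        ids = super()._encode_chunk(text_bytes)
        return ids

    def decode(self, ids):
        # concatenate the token bytes, then un-permute back to the original byte order
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
        return text_bytes.decode("utf-8", errors="replace")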
But I don't really understand why the bytes have to be shuffled here. The only comment given is "for some reason ... and probably historical": the 256 base byte tokens are permuted. Since mergeable_ranks is just a dict, I printed its first 300 entries so they can be compared against a vocab built in plain byte order:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
mergeable_ranks = enc._mergeable_ranks
print(type(mergeable_ranks))
i = 0
for k, v in mergeable_ranks.items():
    print(k, v)
    i += 1
    if i >= 300:
        break
print("-------------------")
vocab = {idx: bytes([idx]) for idx in range(256)}
for i in vocab:
    print(i, vocab[i])
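Continuing from the snippet above, here is a small sanity check of my own (not in the original code) that the shuffle really is just a permutation: every single-byte token should receive a rank below 256, and those ranks should cover 0..255 exactly once.

# my own check, assuming `mergeable_ranks` from the snippet above:
# every single byte should map to a distinct rank below 256
byte_shuffle = {i: mergeable_ranks[bytes([i])] for i in range(256)}
assert sorted(byte_shuffle.values()) == list(range(256))
moved = sum(1 for i in range(256) if byte_shuffle[i] != i)
print(f"{moved} of 256 bytes get a rank different from their raw value")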
def bpe(mergeable_ranks, token, max_rank):
    # helper function used in get_gpt4_merges() to reconstruct the merge forest
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts
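To make the role of max_rank concrete, here is a small illustrative run of my own (assuming mergeable_ranks from cl100k_base is already loaded as above): picking any multi-byte token and replaying the merges up to, but not including, its own rank splits it back into exactly the two parts it was originally merged from, which is precisely what recover_merges relies on.

# illustrative only: grab an arbitrary multi-byte token from cl100k_base
# and show that bpe() with max_rank=rank recovers its two parents
token, rank = next((t, r) for t, r in mergeable_ranks.items() if len(t) > 1)
parts = bpe(mergeable_ranks, token, max_rank=rank)
print(token, "->", parts)  # exactly two parts
print((mergeable_ranks[parts[0]], mergeable_ranks[parts[1]]), "merge into rank", rank)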
def recover_merges(mergeable_ranks):
    # the `merges` are already the byte sequences in their merged state.
    # so we have to recover the original pairings. We can do this by doing
    # a small BPE training run on all the tokens, in their order.
    # also see https://github.com/openai/tiktoken/issues/60
    # also see https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306
    merges = {}
    for token, rank in mergeable_ranks.items():
        if len(token) == 1:
            continue  # skip raw bytes
        pair = tuple(bpe(mergeable_ranks, token, max_rank=rank))
        assert len(pair) == 2
        # recover the integer ranks of the pair
        ix0 = mergeable_ranks[pair[0]]
        ix1 = mergeable_ranks[pair[1]]
        merges[(ix0, ix1)] = rank
    return merges
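Putting the pieces together, a quick end-to-end check (my own sketch, not part of minbpe) is to recover the merges, rebuild the vocab exactly as __init__ does, and verify that, after undoing the byte shuffle, every multi-byte token of cl100k_base is reproduced:

# my own sanity check, assuming `mergeable_ranks` from the earlier snippet
merges = recover_merges(mergeable_ranks)
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

# the rebuilt vocab lives in the *shuffled* byte space, so map it back
# through the inverse shuffle before comparing against the official tokens
inverse_byte_shuffle = {mergeable_ranks[bytes([i])]: i for i in range(256)}
for token, rank in mergeable_ranks.items():
    if len(token) == 1:
        continue
    assert bytes(inverse_byte_shuffle[b] for b in vocab[rank]) == token
print("recovered", len(merges), "merges; vocab matches cl100k_base")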