Skip to content

Commit

Permalink
pr comments
Browse files: browse the repository at this point in the history
  • Loading branch information
dakinggg committed Nov 28, 2023
1 parent 72f571a commit cf3b82b
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions llmfoundry/tokenizers/tiktoken.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -127,7 +127,7 @@ def pickle_Encoding(enc: Encoding):
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

self.decoder = {}
self.decoder: Dict[int, str] = {}
for i in range(self.encoding.n_vocab):
try:
self.encoding.decode_single_token_bytes(i)
Expand All @@ -141,7 +141,7 @@ def pickle_Encoding(enc: Encoding):
])
self.decoder[i] = decoding

self.encoder = {}
self.encoder: Dict[str, int] = {}
for i in range(self.encoding.n_vocab):
if i in self.decoder:
self.encoder[self.decoder[i]] = i
Expand Down Expand Up @@ -227,15 +227,15 @@ def _tokenize(self, text: str) -> List[str]:

return tokens

def _convert_token_to_id(self, token: str):
def _convert_token_to_id(self, token: str) -> Optional[int]:
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))

def _convert_id_to_token(self, index: int):
def _convert_id_to_token(self, index: int) -> Optional[str]:
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)

def convert_tokens_to_string(self, tokens: List[str]):
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (string) in a single string."""
text = ''.join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8')
Expand Down

0 comments on commit cf3b82b

Please sign in to comment.