Implement a character-level tokenizer with string-to-index (stoi) and index-to-string (itos) mappings. Include a special BOS (beginning of sequence) token. The tokenizer should encode text to integer sequences and decode back.
class CharTokenizer:
    """Character-level tokenizer with a reserved BOS (beginning-of-sequence) token.

    The vocabulary is the set of distinct characters in the text passed to
    the constructor. Index 0 is always the BOS token; the characters follow
    in sorted order starting at index 1.

    Attributes:
        bos_token: String used as the BOS marker.
        stoi: Mapping from token string to integer index.
        itos: Mapping from integer index back to token string.
        vocab_size: Number of vocabulary entries, BOS included.
    """

    def __init__(self, text: str, bos_token: str = "<BOS>"):
        """Build the vocabulary from ``text``.

        Args:
            text: Corpus whose distinct characters form the vocabulary.
            bos_token: String to use as the BOS marker (always index 0).
        """
        self.bos_token = bos_token
        # A character identical to the BOS marker must not get its own slot:
        # it would overwrite stoi[bos_token], move BOS away from index 0 and
        # leave a hole in the id range, so ids could reach vocab_size.
        chars = sorted(set(text) - {bos_token})
        self.stoi = {bos_token: 0}
        self.stoi.update((ch, idx) for idx, ch in enumerate(chars, start=1))
        self.itos = {idx: tok for tok, idx in self.stoi.items()}
        self.vocab_size = len(self.stoi)

    def encode(self, text: str, add_bos: bool = True) -> list[int]:
        """Convert ``text`` to a list of vocabulary indices.

        Args:
            text: String to encode, one index per character.
            add_bos: When true, prepend the BOS index to the result.

        Returns:
            The list of integer indices.

        Raises:
            KeyError: If ``text`` contains a character that is not in the
                vocabulary.
        """
        tokens = [self.stoi[self.bos_token]] if add_bos else []
        try:
            tokens.extend(self.stoi[ch] for ch in text)
        except KeyError as err:
            # Same exception type as a plain lookup, but with a message that
            # says what went wrong instead of just echoing the character.
            raise KeyError(
                f"character {err.args[0]!r} is not in the vocabulary"
            ) from None
        return tokens

    def decode(self, tokens: list[int], *, skip_bos: bool = False) -> str:
        """Convert a sequence of indices back to a string.

        Indices that are not in the vocabulary are silently skipped.

        Args:
            tokens: Indices to decode.
            skip_bos: When true, omit BOS tokens from the output so that
                ``decode(encode(s), skip_bos=True) == s``. The default keeps
                the BOS marker text in the result.

        Returns:
            The concatenation of the decoded token strings.
        """
        bos_id = self.stoi[self.bos_token]
        return "".join(
            self.itos[t]
            for t in tokens
            if t in self.itos and not (skip_bos and t == bos_id)
        )


# stoi maps characters to indices, itos maps indices back to characters.
# encode converts text to a list of integer indices, optionally prepending BOS.
# decode converts indices back to a string.