Documentation
Index ¶
- Constants
- type Decoder
- type Info
- type Vocab
- func (vocab *Vocab) AddSpecialToken(token []byte)
- func (vocab *Vocab) AddSpecialTokens(specialTokens [][]byte, size int)
- func (vocab *Vocab) AddToken(token []byte)
- func (vocab *Vocab) AddTokens(addTokens [][]byte, specialTokens [][]byte, size int)
- func (vocab *Vocab) Capcode() uint8
- func (vocab *Vocab) Charset() uint8
- func (vocab *Vocab) Count(data []byte) (int, int, error)
- func (vocab *Vocab) Decode(tokens []uint32) []byte
- func (vocab *Vocab) DecodeSerialized(b []byte, encodingLength uint8, buffer []byte) []byte
- func (vocab *Vocab) DeleteToken(token []byte)
- func (vocab *Vocab) DeleteTokenID(id uint32)
- func (vocab *Vocab) DeleteTokens(deleteTokens [][]byte, size int)
- func (vocab *Vocab) Denormalize(b []byte) []byte
- func (vocab *Vocab) Deserialize(data []byte, encodingLength uint8) (tokens []uint32)
- func (vocab *Vocab) DisableUnkToken()
- func (vocab *Vocab) EnableUnkToken() bool
- func (vocab *Vocab) ExportYAML(writer io.Writer, orderByScore bool)
- func (vocab *Vocab) HasUnk() bool
- func (vocab *Vocab) HighestTokenID() int
- func (vocab *Vocab) IdToToken(id uint32) []byte
- func (vocab *Vocab) Len() int
- func (vocab *Vocab) MaxTokenLength() int
- func (vocab *Vocab) Mode() uint8
- func (vocab *Vocab) ModifyVocabulary(addTokens [][]byte, specialTokens [][]byte, deleteTokens [][]byte, size int, ...)
- func (vocab *Vocab) ModifyVocabularyFromYAML(yml []byte, size int, resetTokenIds bool)
- func (vocab *Vocab) NewDecoder() *Decoder
- func (vocab *Vocab) Normalization() string
- func (vocab *Vocab) NormalizationCode() uint8
- func (vocab *Vocab) Normalize(data []byte) ([]byte, error)
- func (vocab *Vocab) NumDeletedTokens() int
- func (vocab *Vocab) NumSingleByteTokens() int
- func (vocab *Vocab) NumSpecialTokens() int
- func (vocab *Vocab) PrivateGenerateVocab(yamlData []byte, tokens [][]byte, scores []float32, addTokens [][]byte, ...) error
- func (vocab *Vocab) ResetTokenIds()
- func (vocab *Vocab) Resize(size int)
- func (vocab Vocab) Save(outputFilename string) error
- func (vocab Vocab) SaveWithMapping(outputFilename string, mapping []uint32) error
- func (vocab *Vocab) SingleByteTokens() []byte
- func (vocab *Vocab) SingleBytesTrainingCode() uint8
- func (vocab *Vocab) SpecialTokens() []Info
- func (vocab *Vocab) TokenToId(b []byte) (uint32, bool)
- func (vocab *Vocab) Tokenize(data []byte) ([]uint32, int, error)
- func (vocab *Vocab) TokenizeToSerialized(data []byte, encodingLength uint8, buffer []byte) ([]byte, uint8, int, error)
- func (vocab *Vocab) Tokens() [][]byte
- func (vocab *Vocab) TokensDetailed() []Info
- func (vocab *Vocab) Unk() uint32
- type YamlItem
- type YamlVocab
Constants ¶
const (
DOES_NOT_EXIST = 16777215
)
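A hedged sketch of how this sentinel might be checked. That ID-returning methods such as Unk() fall back to DOES_NOT_EXIST is an assumption here, not something this page states; it also assumes a previously loaded `vocab`:

if vocab.Unk() == DOES_NOT_EXIST {
    // Assumption: no UNK token is set on this vocabulary.
    // Prefer HasUnk() for an explicit check.
}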
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Decoder ¶
type Decoder struct {
// contains filtered or unexported fields
}
A decoder object for sequential decoding. Use the NewDecoder function of the Vocab struct.
func (*Decoder) DecodeSerialized ¶
Decodes tokens from a serialized bytes slice. `encodingLength` must be one of: 0, 2, 3, 4. If you enter `encodingLength` 0 then it will determine the encoding length from the vocabulary size. `buffer` is optional, you can send it `nil` and it will allocate a new slice.
func (*Decoder) Deserialize ¶
Deserializes tokens encoded in a bytes stream into a slice of uint32 token IDs. `encodingLength` must be one of: 0, 2, 3, 4. If you enter `encodingLength` 0 then it will determine the encoding length from the vocabulary size.
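A minimal sketch of sequential decoding from serialized chunks, assuming a previously loaded `vocab` and a hypothetical `serializedChunks` input:

decoder := vocab.NewDecoder()
var out []byte
for _, chunk := range serializedChunks { // [][]byte, hypothetical input
    // encodingLength 0: derived from the vocabulary size; nil buffer: a new slice is allocated.
    out = append(out, decoder.DecodeSerialized(chunk, 0, nil)...)
}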
type Info ¶
type Info struct {
	Id           uint32
	Token        []byte
	TokenDecoded []byte
	Type         uint8 // 0 = regular, 1 = character, 2 = special, 3 = unk
	Score        float32
}
Info struct allows access to detailed information about each token from TokensDetailed(). Token is the token still encoded with capcode. TokenDecoded is the decoded form of the token; however, a token's decoded form can be affected by a preceding token in a sequence, so this field cannot be relied upon for decoding. Type is 0 for regular tokens, 1 for character tokens, 2 for special tokens, 3 for the UNK token. Score is the percentage of the training dataset that this token covered and is used for sorting the tokens by importance.
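For example, the Score and Type fields can be used to inspect token coverage (a sketch; assumes a previously loaded `vocab`):

for _, info := range vocab.TokensDetailed() {
    if info.Type == 2 { // special token
        fmt.Printf("ID %d: %q covers %.4f%% of training data\n", info.Id, info.TokenDecoded, info.Score)
    }
}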
type Vocab ¶
type Vocab struct {
// contains filtered or unexported fields
}
The main struct for the vocabulary
func NewVocab ¶
func NewVocab(tokens [][]byte, specialTokens [][]byte, charset uint8, normalization string, usingCapcode uint8, include256bytes bool, include128bytes bool, includeUTF8bytes bool, includeASCIIbytes bool, includeExtendedBytes bool, excludeOtherBytes bool) (*Vocab, error)
NewVocab makes a fresh vocabulary from a custom list of tokens. If you generated your vocabulary with TokenMonster tools, you will not be using this function but instead using `Load`.
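A sketch of building a tiny vocabulary from scratch. The charset and capcode codes (1 and 2) and the empty normalization string are illustrative assumptions; consult the TokenMonster documentation for the values appropriate to your data:

tokens := [][]byte{[]byte("hello"), []byte(" world")}
special := [][]byte{[]byte("<eos>")}
// charset 1 (assumed UTF-8), no normalization, usingCapcode 2 (assumed),
// include all 256 single-byte tokens, exclude nothing else.
vocab, err := NewVocab(tokens, special, 1, "", 2, true, false, false, false, false, false)
if err != nil {
    log.Fatal(err)
}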
func NewVocabFromYAML ¶
NewVocabFromYAML makes a fresh vocabulary from a YAML file.
func (*Vocab) AddSpecialToken ¶
Adds a single special token to the vocabulary. A special token is special because it is the only token permitted to tokenize any text that contains it. If any regular tokens contain your special token within them, they will be deleted. Modifying a vocabulary does not change existing token IDs. All normalization and capcode is applied automatically.
func (*Vocab) AddSpecialTokens ¶
Add multiple special tokens and optionally resize. Enter `size` 0 to not resize. Modifying a vocabulary does not change existing token IDs.
func (*Vocab) AddToken ¶
Adds a single token to the vocabulary. Modifying a vocabulary does not change existing token IDs. All normalization and capcode is applied automatically.
func (*Vocab) AddTokens ¶
Adds multiple regular and optionally special tokens. You can use `size` to resize the vocabulary to keep it at a specific size. Enter `size` 0 to not resize. Modifying a vocabulary does not change existing token IDs.
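A sketch of adding tokens while pinning the vocabulary at its current size (hypothetical token values; assumes a previously loaded `vocab`):

add := [][]byte{[]byte(" example"), []byte(" tokens")}
special := [][]byte{[]byte("<pad>")}
// Passing the current Len() resizes back down after the additions;
// pass 0 to let the vocabulary grow instead.
vocab.AddTokens(add, special, vocab.Len())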
func (*Vocab) Decode ¶
Decodes tokens back into bytes. If you are decoding a stream of tokens individually or in batches, instead of all at once, you should use the Decode method of the Decoder struct instead.
func (*Vocab) DecodeSerialized ¶
Decodes tokens from a serialized bytes slice. `encodingLength` must be one of: 0, 2, 3, 4. If you enter `encodingLength` 0 then it will determine the encoding length from the vocabulary size. `buffer` is optional, you can send it `nil` and it will allocate a new slice. If you are decoding a stream of tokens individually or in batches, instead of all at once, you should use the Decode method for the Decoder struct instead.
func (*Vocab) DeleteToken ¶
Deletes a single token from the vocabulary. Tokens to delete can be capcoded encoded or not, it will look for both. Modifying a vocabulary does not change existing token IDs.
func (*Vocab) DeleteTokenID ¶
Deletes a single token from the vocabulary by specifying the ID. Modifying a vocabulary does not change existing token IDs.
func (*Vocab) DeleteTokens ¶
Delete multiple tokens and optionally resize. Tokens to delete can be capcoded encoded or not, it will look for both. Enter `size` 0 to not resize. Modifying a vocabulary does not change existing token IDs.
func (*Vocab) Denormalize ¶
Decodes capcode from the bytes.
func (*Vocab) Deserialize ¶
Deserializes tokens encoded in a bytes stream into a slice of uint32 token IDs. `encodingLength` must be one of: 0, 2, 3, 4. If you enter `encodingLength` 0 then it will determine the encoding length from the vocabulary size.
func (*Vocab) DisableUnkToken ¶
func (vocab *Vocab) DisableUnkToken()
Disables the UNK token. Without an UNK token, a character that has no token to represent it will be ignored.
func (*Vocab) EnableUnkToken ¶
Enables the UNK token. Returns true if successful, returns false if an UNK token is not applicable to this vocabulary (all bytes have tokens). If enabled, UNK token will be inserted for every character for which there is no token. You can resize after this if you want to keep the vocabulary sized as it was before, otherwise it will be 1 larger.
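A sketch of enabling UNK without growing the vocabulary, per the note above (assumes a previously loaded `vocab`):

size := vocab.Len()
if vocab.EnableUnkToken() {
    // The UNK token grew the vocabulary by one; shrink back to the original size.
    vocab.Resize(size)
}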
func (*Vocab) ExportYAML ¶
Exports the vocabulary to a human-readable YAML file. It writes to an io.Writer. You can import from YAML with NewVocabFromYAML().
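A sketch of exporting to a file on disk; any os.File satisfies io.Writer:

f, err := os.Create("vocab.yaml")
if err != nil {
    log.Fatal(err)
}
defer f.Close()
vocab.ExportYAML(f, true) // true: order tokens by score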
func (*Vocab) HasUnk ¶
Returns true if the vocabulary is using the UNK token. If used, the UNK token ID is used whenever a character being tokenized doesn't exist in the vocabulary.
func (*Vocab) HighestTokenID ¶
Returns the value of the highest token ID.
func (*Vocab) MaxTokenLength ¶
The length of the longest (encoded) token in the vocabulary. This can be lower than the maximum length allowed during training if none of the longer candidate tokens made it into the vocabulary.
func (*Vocab) Mode ¶
The original filter for training the vocabulary. 0 = unfiltered, 1 = clean, 2 = balanced, 3 = consistent, 4 = strict, 5 = not trained with trainvocab.
func (*Vocab) ModifyVocabulary ¶
func (vocab *Vocab) ModifyVocabulary(addTokens [][]byte, specialTokens [][]byte, deleteTokens [][]byte, size int, resetTokenIds bool)
Add regular & special tokens, delete tokens and resize, all in one. Modifying a vocabulary does not change existing token IDs. Pass resetTokenIds = true to ensure there are no gaps in the token IDs.
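A sketch of a combined modification (hypothetical token values; assumes a previously loaded `vocab`):

add := [][]byte{[]byte(" new")}
special := [][]byte{[]byte("<sep>")}
del := [][]byte{[]byte(" old")}
// size 0: no resize; resetTokenIds true: renumber IDs with no gaps.
vocab.ModifyVocabulary(add, special, del, 0, true)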
func (*Vocab) ModifyVocabularyFromYAML ¶
Add regular & special tokens, delete tokens and resize, all in one, from a YAML definition. Modifying a vocabulary does not change existing token IDs. Pass resetTokenIds = true to ensure there are no gaps in the token IDs.
func (*Vocab) NewDecoder ¶
Creates a new Decoder instance. This is for decoding tokens in a sequence when they are to be decoded individually or in batches. If you are decoding all in one go, you can use the Vocab's Decode method.
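A sketch of decoding in batches; the Decoder carries capcode state between calls. The Decode method is referenced above but not indexed on this page, so its signature is assumed here to mirror Vocab.Decode:

decoder := vocab.NewDecoder()
for _, batch := range tokenBatches { // [][]uint32, hypothetical input
    os.Stdout.Write(decoder.Decode(batch))
}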
func (*Vocab) Normalization ¶
The type of normalization applied automatically when tokenizing. Returns a string.
func (*Vocab) NormalizationCode ¶
The type of normalization applied automatically when tokenizing. Returns a uint8.
func (*Vocab) NumDeletedTokens ¶
The number of tokens deleted from the vocabulary. These can be restored by resizing the vocabulary to be larger.
func (*Vocab) NumSingleByteTokens ¶
The number of single byte tokens in the vocabulary.
func (*Vocab) NumSpecialTokens ¶
Returns the number of special tokens in the vocabulary.
func (*Vocab) PrivateGenerateVocab ¶
func (vocab *Vocab) PrivateGenerateVocab(yamlData []byte, tokens [][]byte, scores []float32, addTokens [][]byte, deleteTokens [][]byte, specialTokens [][]byte, specialTokensEncoded [][]byte, charset uint8, normalizeString string, usingCapcode uint8, level uint8, reserve uint8, resize int, resetTokenIds bool) error
Don't use this function; it's exported only because the exportvocab tool uses it.
func (*Vocab) ResetTokenIds ¶
Resets all the IDs of the tokens to be assigned alphabetically, starting from 0, with no gaps.
func (*Vocab) Resize ¶
Resize the vocabulary by deleting the worst scoring tokens. You can also resize the vocabulary to be larger if any tokens have previously been deleted. Modifying a vocabulary does not change existing token IDs.
func (Vocab) Save ¶
Saves the vocabulary to a local file.
func (Vocab) SaveWithMapping ¶
Saves the vocabulary to a local file, applying the given token ID `mapping`.
func (*Vocab) SingleByteTokens ¶
A slice that contains all the single byte tokens in the vocabulary. Note that this is returned as a single slice of bytes (one byte per token), not a slice of byte slices.
func (*Vocab) SingleBytesTrainingCode ¶
Returns the uint8 code corresponding to the training parameters for single byte tokens.
func (*Vocab) SpecialTokens ¶
Returns an Info struct for each of the special tokens in the vocabulary, including both the encoded and decoded forms of each token.
func (*Vocab) TokenToId ¶
Returns the ID of the token from bytes. This only works for capcode encoded tokens. Apply `Normalize` to the bytes first to use this with decoded tokens.
func (*Vocab) Tokenize ¶
Tokenizes text from a bytes slice to token IDs. The 2nd return value (int) is the number of characters for which there was no token and which were therefore replaced with the UNK token.
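A round-trip sketch. The package-level Load function is mentioned under NewVocab; its exact signature, the import path, and the vocabulary filename are assumptions here:

package main

import (
    "fmt"
    "log"

    tokenmonster "github.com/alasdairforsythe/tokenmonster/go" // assumed import path
)

func main() {
    vocab, err := tokenmonster.Load("english-32000-balanced-v1.vocab") // hypothetical file
    if err != nil {
        log.Fatal(err)
    }
    tokens, missing, err := vocab.Tokenize([]byte("The quick brown fox"))
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%d tokens, %d characters without a token\n", len(tokens), missing)
    fmt.Println(string(vocab.Decode(tokens)))
}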
func (*Vocab) TokenizeToSerialized ¶
func (vocab *Vocab) TokenizeToSerialized(data []byte, encodingLength uint8, buffer []byte) ([]byte, uint8, int, error)
Tokenizes directly into serialized bytes with 16-bit, 24-bit or 32-bit encoded unsigned integers depending on the vocabulary size. Set `encodingLength` to 0 for it to be chosen automatically, or set it to 2, 3 or 4. The 2nd return value is the encodingLength that was used, and the 3rd is the number of characters for which there were no tokens. `buffer` is an optional reusable buffer; you can send nil.
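A sketch of a serialized round trip (assumes a previously loaded `vocab` and input `data`):

serialized, encLen, missing, err := vocab.TokenizeToSerialized(data, 0, nil)
if err != nil {
    log.Fatal(err)
}
// Decode using the encoding length that was chosen automatically.
decoded := vocab.DecodeSerialized(serialized, encLen, nil)
fmt.Printf("missing characters: %d, decoded: %s\n", missing, decoded)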
func (*Vocab) Tokens ¶
Returns a slice of all tokens in the vocabulary (excluding UNK), in their encoded capcode form.
func (*Vocab) TokensDetailed ¶
Returns a slice of Info structs in which the index of each entry is its token ID.
type YamlVocab ¶
type YamlVocab struct {
	Charset              string     `yaml:"charset,omitempty"`
	Normalization        string     `yaml:"normalization,omitempty"`
	Capcode              int        `yaml:"capcode,omitempty"`
	TrainingParam        *int       `yaml:"training-param,omitempty"`
	ResetTokenIds        bool       `yaml:"reset-token-ids,omitempty"`
	Include256Bytes      bool       `yaml:"include-256-bytes,omitempty"`
	Include128Bytes      bool       `yaml:"include-128-bytes,omitempty"`
	IncludeUtf8Bytes     bool       `yaml:"include-utf8-bytes,omitempty"`
	IncludeAsciiBytes    bool       `yaml:"include-ascii-bytes,omitempty"`
	IncludeExtendedBytes bool       `yaml:"include-extended-bytes,omitempty"`
	ExcludeOtherBytes    bool       `yaml:"exclude-other-bytes,omitempty"`
	Unk                  bool       `yaml:"unk,omitempty"`
	UnkId                *int       `yaml:"unk-id,omitempty"`
	Regular              []YamlItem `yaml:"tokens,omitempty"`
	Special              []YamlItem `yaml:"special,omitempty"`
	Delete               []YamlItem `yaml:"delete,omitempty"`
}
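A sketch of a matching YAML document fed through NewVocabFromYAML. The per-item `token` key is assumed from the YamlItem type (its fields are not shown on this page), as is NewVocabFromYAML accepting the raw bytes:

yml := []byte(`charset: utf-8
capcode: 2
include-256-bytes: true
tokens:
  - token: " the"
  - token: " and"
special:
  - token: "<eos>"
`)
vocab, err := NewVocabFromYAML(yml)
if err != nil {
    log.Fatal(err)
}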