avante.nvim/lua/avante/tokenizers.lua
Aaron Pham d2095ba267
feat: tokenizers (#407)
* feat: autobuild tiktoken lib and shenanigans

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: revert readme changes

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(build): windows

Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz>

* chore(plugin): early load commands and base setup

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(build): make sync

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* feat: rust go vroom vroom

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* feat: scuffed afaf implementation binding go brrrr

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: remove dups

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(tokens): calculate whether we should do prompt_caching (fixes #416)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: ignore lockfiles

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* Update README.md

* Update crates/avante-tokenizers/README.md

* chore: remove unused

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: remove auto build

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

---------

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz>
Co-authored-by: yuchanns <me@yuchanns.xyz>
2024-08-31 07:19:59 -04:00

67 lines
1.3 KiB
Lua

local Utils = require("avante.utils")
---@class AvanteTokenizer
---@field from_pretrained fun(model: string): nil
---@field encode fun(prompt: string): integer[]
-- Native tokenizer backend (the "avante_tokenizers" Rust module).
-- Stays nil until M.setup() successfully requires it; all public functions
-- treat nil as "backend unavailable" and fall back or no-op accordingly.
local tokenizers = nil
-- Module table returned at the bottom of the file.
local M = {}
--- Load the native tokenizer backend and fetch the tokenizer for `model`.
--- Silently does nothing when the "avante_tokenizers" native module is not
--- installed; warns when a (possibly gated) HuggingFace model is requested
--- without HF_TOKEN set.
---@param model "gpt-4o" | string
M.setup = function(model)
  local ok, core = pcall(require, "avante_tokenizers")
  -- Best-effort: the native library is optional.
  if not ok then return end
  ---@cast core AvanteTokenizer

  -- Remember the backend for M.available()/M.encode(); `tokenizers` is
  -- never false, so `or` is equivalent to the nil check.
  tokenizers = tokenizers or core

  if os.getenv("HF_TOKEN") == nil and model ~= "gpt-4o" then
    Utils.warn(
      "Please set HF_TOKEN environment variable to use HuggingFace tokenizer if " .. model .. " is gated",
      { once = true }
    )
  end

  -- Keep the HuggingFace Hub download quiet inside the editor.
  vim.env.HF_HUB_DISABLE_PROGRESS_BARS = 1
  core.from_pretrained(model)
end
--- Whether the native tokenizer backend has been loaded via M.setup().
---@return boolean
function M.available()
  return tokenizers ~= nil
end
--- Tokenize `prompt` with the native backend.
--- Returns nil when the backend is not loaded or the prompt is empty/falsy;
--- raises (at the caller's position) when given a non-string, non-falsy value.
---@param prompt string
---@return integer[]|nil
function M.encode(prompt)
  -- Backend missing, or nothing to tokenize.
  if tokenizers == nil or not prompt or prompt == "" then
    return nil
  end
  if type(prompt) ~= "string" then
    error("Prompt is not type string", 2)
  end
  return tokenizers.encode(prompt)
end
--- Count the tokens in `prompt`.
--- Uses the native tokenizer when loaded; otherwise falls back to a rough
--- one-token-per-two-characters heuristic.
---@param prompt string
---@return integer
M.count = function(prompt)
  -- Guard first: without this, a nil prompt crashed on `#prompt` below
  -- whenever the native backend was not loaded. An empty string already
  -- counted as 0 on every path, so this is backward-compatible.
  if not prompt or prompt == "" then
    return 0
  end
  if not tokenizers then
    -- Heuristic fallback: ~2 characters per token.
    return math.ceil(#prompt * 0.5)
  end
  local tokens = M.encode(prompt)
  if not tokens then
    return 0
  end
  return #tokens
end
return M