diff --git a/package.json b/package.json
index d7d2e9c..eb8bd8c 100644
--- a/package.json
+++ b/package.json
@@ -153,7 +153,12 @@
         "dumbpilot.llamaTop_k": {"type": "number", "default": 40},
         "dumbpilot.llamaTypical_p": {"type": "number", "default": 0.95},
         "dumbpilot.llamaTailfree_z": {"type": "number", "default": 0.5},
-        "dumbpilot.llamaSeed": {"type": "number", "default": -1}
+        "dumbpilot.llamaSeed": {"type": "number", "default": -1},
+        "dumbpilot.llamaCachePrompt": {
+          "type": "boolean",
+          "default": true,
+          "description": "Enable prompt caching for faster results"
+        }
       }
     }
   },
diff --git a/src/extension.ts b/src/extension.ts
index 1a32283..7f9c24a 100644
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -29,7 +29,7 @@ type llamaData = {
     truncated: boolean
 };
 
-type llamaCompletionRequest = {
+type llamaRequest = {
     n_predict: number,
     mirostat: number,
     repeat_penalty: number,
@@ -43,25 +43,10 @@ type llamaCompletionRequest = {
     tfs_z: number,
     seed: number,
     stream: boolean,
-    prompt: string,
-};
-
-type llamaFillRequest = {
-    n_predict: number,
-    mirostat: number,
-    repeat_penalty: number,
-    frequency_penalty: number,
-    presence_penalty: number,
-    repeat_last_n: number,
-    temperature: number,
-    top_p: number,
-    top_k: number,
-    typical_p: number,
-    tfs_z: number,
-    seed: number,
-    stream: boolean,
-    input_prefix: string,
-    input_suffix: string
+    cache_prompt: boolean,
+    prompt?: string,
+    input_prefix?: string,
+    input_suffix?: string
 };
 
 type fetchErrorCause = {
@@ -118,7 +103,13 @@ export function activate(context: vscode.ExtensionContext) {
 
     console.log('dumbpilot is now active');
 
-    const config = vscode.workspace.getConfiguration("dumbpilot");
+    let config = vscode.workspace.getConfiguration("dumbpilot");
+
+    // handle configuration changes
+    context.subscriptions.push(vscode.workspace.onDidChangeConfiguration(e => {
+        config = vscode.workspace.getConfiguration("dumbpilot");
+    }));
+
     let completion_enabled: boolean = config.get("completionEnabled") as boolean;
 
     // TODO: work with local configurations
@@ -183,8 +174,9 @@ export function activate(context: vscode.ExtensionContext) {
         // FIXME: is there a more efficient way?
         doc_before = pfx + ' ' + fname + sfx + '\n' + doc_before;
 
-        // server request object
-        const request: llamaCompletionRequest = {
+        const fim = config.get("fimEnabled") as boolean;
+        let req_str: string;
+        let request: llamaRequest = {
             n_predict: config.get("llamaMaxtokens") as number,
             mirostat: config.get("llamaMirostat") as number,
             repeat_penalty: config.get("llamaRepeatPenalty") as number,
@@ -198,14 +190,24 @@
             tfs_z: config.get("llamaTailfree_z,") as number,
             seed: config.get("llamaSeed") as number,
             stream: false,
-            prompt: doc_before,
+            cache_prompt: config.get("llamaCachePrompt") as boolean
         };
-
+
+        if (fim === true) {
+            req_str = '/infill';
+            request.input_prefix = doc_before;
+            request.input_suffix = doc_after;
+        } else {
+            req_str = '/completion';
+            request.prompt = doc_before;
+        }
+        console.log(fim);
+
         let data: llamaData;
         // try to send the request to the running server
         try {
            const response_promise = fetch(
-                (config.get("llamaHost") as string).concat('/completion'),
+                (config.get("llamaHost") as string).concat(req_str),
                 {
                     method: 'POST',
                     headers: {
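
Note on reading the new setting: the diff uses config.get("llamaCachePrompt") as boolean, which only yields a defined value because package.json supplies a default. A small sketch of a more defensive read with the same VS Code API, where the explicit fallback of true is an assumption matching the declared default:

import * as vscode from 'vscode';

const config = vscode.workspace.getConfiguration("dumbpilot");
// get() with a default value never returns undefined, unlike the plain cast
const cachePrompt: boolean = config.get<boolean>("llamaCachePrompt", true);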
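
The onDidChangeConfiguration handler added in extension.ts fires for every settings change in the window, not only for dumbpilot settings. A sketch of a narrower variant; everything matches the diff except the affectsConfiguration() guard:

import * as vscode from 'vscode';

export function activate(context: vscode.ExtensionContext) {
    let config = vscode.workspace.getConfiguration("dumbpilot");

    context.subscriptions.push(vscode.workspace.onDidChangeConfiguration(e => {
        // re-read the configuration only when a dumbpilot.* setting actually changed
        if (e.affectsConfiguration("dumbpilot")) {
            config = vscode.workspace.getConfiguration("dumbpilot");
        }
    }));
}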
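
For reference, the request flow the extension.ts hunks implement, condensed into one sketch. As the diff shows, the llama.cpp server's /completion endpoint takes prompt, /infill takes input_prefix and input_suffix, and both accept cache_prompt. sendLlamaRequest is a hypothetical helper name, the field list is trimmed to the parameters visible in the hunks, and the Content-Type header and JSON.stringify body are assumptions about the part of the fetch call the last hunk cuts off:

type llamaRequest = {
    n_predict: number,
    stream: boolean,
    cache_prompt: boolean,
    prompt?: string,
    input_prefix?: string,
    input_suffix?: string
};

// Hypothetical helper mirroring what the diff does inline in the completion provider:
// build one request object, then pick the endpoint based on the fimEnabled setting.
async function sendLlamaRequest(host: string, fim: boolean, doc_before: string, doc_after: string) {
    const request: llamaRequest = {
        n_predict: 128,      // stands in for config.get("llamaMaxtokens")
        stream: false,
        cache_prompt: true   // stands in for config.get("llamaCachePrompt")
    };
    const endpoint = fim ? '/infill' : '/completion';
    if (fim) {
        request.input_prefix = doc_before;
        request.input_suffix = doc_after;
    } else {
        request.prompt = doc_before;
    }
    return fetch(host.concat(endpoint), {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(request)
    });
}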