From 960b2190bf3a63db545bbbe6d8599f6578a859c8 Mon Sep 17 00:00:00 2001
From: Alessandro Mauri
Date: Sat, 16 Dec 2023 16:56:44 +0100
Subject: [PATCH] separate llama.cpp server api into a different source file

---
 TODO.md             |   4 +-
 package.json        |  17 ++++
 src/common.ts       |  58 ++++++++++++++
 src/extension.ts    | 129 ++++++++---------------------
 src/llamacpp-api.ts | 191 ++++++++++++++++++++++++++++----------------
 src/openai-api.ts   | 116 +++++++++++++++++++++++++++
 6 files changed, 350 insertions(+), 165 deletions(-)
 create mode 100644 src/common.ts

diff --git a/TODO.md b/TODO.md
index f4b3899..103c779 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,4 +1,4 @@
-[x] - in extensions.json add suffix for languages that require it such as css where comments are: /* stuff */
+[x] - in extensions.json add suffix for languages that require it such as css where comments are: /_ stuff _/
 [] - test cancel token
 [] - add fill in middle
 [x] - add config option to disable the extension
@@ -10,4 +10,4 @@
 [] - option to backup and restore model settings
 [] - add a window to quickly modify model configs
 [] - decorate ai generated text https://github.com/microsoft/vscode-extension-samples/tree/main/decorator-sample
-[] - when trying to use completion when there is an active selection either substitute the selection or use the selection as context instead of the whole file
\ No newline at end of file
+[] - when trying to use completion when there is an active selection either substitute the selection or use the selection as context instead of the whole file
diff --git a/package.json b/package.json
index 09b0419..1aa5fe3 100644
--- a/package.json
+++ b/package.json
@@ -223,6 +223,16 @@
 			"dumbpilot.llamaSystemPrompt": {
 				"type": "string",
 				"description": "The system prompt that the model considers at the beginning of every request, used by instruct models"
+			},
+			"dumbpilot.llamaUseOpenAIAPI": {
+				"type": "boolean",
+				"default": true,
+				"description": "Use the OpenAI API to make requests to the server instead of the llama.cpp server API"
+			},
+			"dumbpilot.llamaModelName": {
+				"type": "string",
+				"default": "deepseek-coder-6.7B-base.gguf",
+				"description": "Name of the model to use, only works in OpenAI API mode"
 			}
 		}
 	}
@@ -246,5 +256,12 @@
 		"mocha": "^10.2.0",
 		"typescript": "^5.2.2",
 		"@vscode/test-electron": "^2.3.6"
+	},
+	"prettier": {
+		"tabWidth": 4,
+		"printWidth": 100,
+		"useTabs": true,
+		"singleQuote": true,
+		"trailingComma": "es5"
 	}
 }
\ No newline at end of file
diff --git a/src/common.ts b/src/common.ts
new file mode 100644
index 0000000..5cc1833
--- /dev/null
+++ b/src/common.ts
@@ -0,0 +1,58 @@
+import * as vscode from 'vscode';
+
+// common data structures and functions
+
+export type FetchErrorCause = {
+	errno: number;
+	code: string;
+	syscall: string;
+	address: string;
+	port: number;
+};
+
+// a summary of the received data
+export type ResponseData = {
+	content: string;
+	tokens: number;
+	time: number;
+};
+
+// Show a message notification with a set timeout
+export async function showMessageWithTimeout(message: string, timeout: number): Promise<void> {
+	void vscode.window.withProgress(
+		{
+			location: vscode.ProgressLocation.Notification,
+			title: message,
+			cancellable: false,
+		},
+		(progress, token) => {
+			token.onCancellationRequested(() => {});
+
+			// This is magic I don't understand
+			const p = new Promise((resolve) => {
+				setTimeout(resolve, timeout);
+			});
+			return p;
+		}
+	);
+}
+
+// show a message on the status bar until the promise is resolved
+export async function showPendingStatusBar(
+	message: string,
+	operation: Promise<any>
+): Promise<void> {
+	void vscode.window
+		.withProgress(
+			{
+				location: vscode.ProgressLocation.Window,
+				title: message,
+			},
+			() => operation
+		)
+		.then(
+			(aok) => {},
+			(err) => {}
+		);
+	// we already resolve the operation elsewhere
+}
diff --git a/src/extension.ts b/src/extension.ts
index 528907d..975b523 100644
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -1,17 +1,19 @@
 import { ok } from 'assert';
 import * as vscode from 'vscode';
 import commentPrefix from './comments.json';
-import {createLlamacppRequest, llamaData, llamaRequest, llamacppRequestEndpoint} from './llamacpp-api';
-
-
-type fetchErrorCause = {
-	errno: number,
-	code: string,
-	syscall: string,
-	address: string,
-	port: number
-};
-
+import {
+	LlamaData,
+	LlamaRequest,
+	createLlamacppRequest,
+	llamacppRequestEndpoint,
+	llamacppMakeRequest,
+} from './llamacpp-api';
+import {
+	FetchErrorCause,
+	ResponseData,
+	showMessageWithTimeout,
+	showPendingStatusBar,
+} from './common';
 
 // clean up the document
 function clean_text(txt: string): string {
@@ -24,56 +26,26 @@ function clean_text(txt: string): string {
 	return txt;
 }
 
-
-// Show a message notification with a set timeout
-async function showMessageWithTimeout(message: string, timeout: number): Promise<void> {
-	void vscode.window.withProgress(
-		{
-			location: vscode.ProgressLocation.Notification,
-			title: message,
-			cancellable: false,
-		}, (progress, token) => {
-			token.onCancellationRequested(() => {});
-
-			// This is magic I don't understand
-			const p = new Promise((resolve) => {
-				setTimeout(resolve, timeout);
-			});
-			return p;
-		});
-};
-
-
-// show a message on the status bar until the promise is resolved
-async function showPendingStatusBar(message: string, operation: Promise<any>): Promise<void> {
-	void vscode.window.withProgress(
-		{
-			location: vscode.ProgressLocation.Window,
-			title: message,
-		}, () => operation ).then((aok) => {}, (err) => {});
-	// we already resolve the operation elsewhere
-}
-
-
 export function activate(context: vscode.ExtensionContext) {
-	console.log('dumbpilot is now active');
-	let config = vscode.workspace.getConfiguration("dumbpilot");
+	let config = vscode.workspace.getConfiguration('dumbpilot');
 
 	// handle completion changes
-	context.subscriptions.push(vscode.workspace.onDidChangeConfiguration(e => {
-		config = vscode.workspace.getConfiguration("dumbpilot");
-	}));
+	context.subscriptions.push(
+		vscode.workspace.onDidChangeConfiguration((e) => {
+			config = vscode.workspace.getConfiguration('dumbpilot');
+		})
+	);
 
 	// TODO: work with local configurations
-	let disposable = vscode.commands.registerCommand("dumbpilot.enableCompletion", () => {
-		config.update("completionEnabled", true);
+	let disposable = vscode.commands.registerCommand('dumbpilot.enableCompletion', () => {
+		config.update('completionEnabled', true);
 	});
 	context.subscriptions.push(disposable);
 
-	disposable = vscode.commands.registerCommand("dumbpilot.disableCompletion", () => {
-		config.update("completionEnabled", false);
+	disposable = vscode.commands.registerCommand('dumbpilot.disableCompletion', () => {
+		config.update('completionEnabled', false);
 	});
 
 	// Register a new provider of inline completions, this does not decide how it is invoked
 	// Only defines what happens when invoked
 	// https://github.com/microsoft/vscode-extension-samples/blob/main/inline-completions/src/extension.ts
 	const provider: vscode.InlineCompletionItemProvider = {
 		async provideInlineCompletionItems(document, position, context, token) {
-
 			// disable if predictive completion is disabled
-			if (config.get("completionEnabled") as boolean === false) {
+			if ((config.get('completionEnabled') as boolean) === false) {
 				return null;
 			}
 
 			// FIXME: I don't know if this works
 			token.onCancellationRequested(() => {
-				console.log("dumbpilot: operation cancelled, may still be running on the server");
+				console.log('dumbpilot: operation cancelled, may still be running on the server');
 				return null;
 			});
 
 			//console.log('dumbpilot: completion invoked at position: line=' + position.line + ' char=' + position.character);
 
 			const result: vscode.InlineCompletionList = {
-				items: []
+				items: [],
 			};
 
 			// Get the document's text and position to send to the model
 			// FIXME: is there a more efficient way?
 			doc_before = pfx + ' ' + fname + sfx + '\n' + doc_before;
 
-			const request: llamaRequest = createLlamacppRequest(config, doc_before, doc_after);
-			console.log(JSON.stringify(request));
-
-			let data: llamaData;
-			// try to send the request to the running server
-			try {
-				const response_promise = fetch(
-					llamacppRequestEndpoint(config),
-					{
-						method: 'POST',
-						headers: {
-							'content-type': 'application/json; charset=UTF-8'
-						},
-						body: JSON.stringify(request)
-					}
-				);
-
-				showPendingStatusBar("dumbpilot waiting", response_promise);
-				const response = await response_promise;
-				if (response.ok === false) {
-					throw new Error("llama server request is not ok??");
-				}
-
-				data = await response.json() as llamaData;
-				const gen_tokens = data.timings.predicted_n;
-				const gen_time = (data.timings.predicted_ms / 1000).toFixed(2);
-				showMessageWithTimeout(`predicted ${gen_tokens} tokens in ${gen_time} seconds`, 1500);
-
-			} catch (e: any) {
-				const err = e as TypeError;
-				const cause: fetchErrorCause = err.cause as fetchErrorCause;
-				const estr: string = err.message + ' ' + cause.code + ' at ' + cause.address + ':' + cause.port;
-				// let the user know something went wrong
-				// TODO: maybe add a retry action or something
-				showMessageWithTimeout('dumbpilot error: ' + estr, 3000);
-				return null;
-			};
+			// actually make the request
+			const request: LlamaRequest = createLlamacppRequest(config, doc_before, doc_after);
+			const endpoint: string = llamacppRequestEndpoint(config);
+			let data: ResponseData = await llamacppMakeRequest(request, endpoint);
 
-			result.items.push({insertText: data.content, range: new vscode.Range(position, position)});
+			result.items.push({
+				insertText: data.content,
+				range: new vscode.Range(position, position),
+			});
 			return result;
 		},
 	};
-	vscode.languages.registerInlineCompletionItemProvider({pattern: '**'}, provider);
+	vscode.languages.registerInlineCompletionItemProvider({ pattern: '**' }, provider);
 }
 
 // This method is called when your extension is deactivated
diff --git a/src/llamacpp-api.ts b/src/llamacpp-api.ts
index bbc67e4..641a738 100644
--- a/src/llamacpp-api.ts
+++ b/src/llamacpp-api.ts
@@ -1,83 +1,91 @@
 import * as vscode from 'vscode';
-
+import './common';
+import {
+	FetchErrorCause,
+	ResponseData,
+	showMessageWithTimeout,
+	showPendingStatusBar,
+} from './common';
 
 // llama.cpp server response format
-export type llamaData = {
-	content: string,
-	generation_settings: JSON,
-	model: string,
-	prompt: string,
-	stopped_eos: boolean,
-	stopped_limit: boolean,
-	stopped_word: boolean,
-	stopping_word: string,
+export type LlamaData = {
+	content: string;
+	generation_settings: JSON;
+	model: string;
+	prompt: string;
+	stopped_eos: boolean;
+	stopped_limit: boolean;
+	stopped_word: boolean;
+	stopping_word: string;
 	timings: {
-		predicted_ms: number,
-		predicted_n: number,
-		predicted_per_second: number,
-		predicted_per_token_ms: number,
-		prompt_ms: number,
-		prompt_n: number,
-		prompt_per_second: number,
-		prompt_per_token_ms: number
-	},
-	tokens_cached: number,
-	tokens_evaluated: number,
-	tokens_predicted: number,
-	truncated: boolean
+		predicted_ms: number;
+		predicted_n: number;
+		predicted_per_second: number;
+		predicted_per_token_ms: number;
+		prompt_ms: number;
+		prompt_n: number;
+		prompt_per_second: number;
+		prompt_per_token_ms: number;
+	};
+	tokens_cached: number;
+	tokens_evaluated: number;
+	tokens_predicted: number;
+	truncated: boolean;
 };
 
-export type llamaRequest = {
-	n_predict: number,
-	mirostat: number,
-	repeat_penalty: number,
-	frequency_penalty: number,
-	presence_penalty: number,
-	repeat_last_n: number,
-	temperature: number,
-	top_p: number,
-	top_k: number,
-	typical_p: number,
-	tfs_z: number,
-	seed: number,
-	stream: boolean,
-	cache_prompt: boolean,
-	prompt?: string,
-	input_prefix?: string,
-	input_suffix?: string
+export type LlamaRequest = {
+	n_predict: number;
+	mirostat: number;
+	repeat_penalty: number;
+	frequency_penalty: number;
+	presence_penalty: number;
+	repeat_last_n: number;
+	temperature: number;
+	top_p: number;
+	top_k: number;
+	typical_p: number;
+	tfs_z: number;
+	seed: number;
+	stream: boolean;
+	cache_prompt: boolean;
+	prompt?: string;
+	input_prefix?: string;
+	input_suffix?: string;
 };
-
-export function createLlamacppRequest(config: vscode.WorkspaceConfiguration, doc_before: string, doc_after: string): llamaRequest
-{
-	let request: llamaRequest = {
-		n_predict: config.get("llamaMaxtokens") as number,
-		mirostat: config.get("llamaMirostat") as number,
-		repeat_penalty: config.get("llamaRepeatPenalty") as number,
-		frequency_penalty: config.get("llamaFrequencyPenalty,") as number,
-		presence_penalty: config.get("llamaPresencePenalty,") as number,
-		repeat_last_n: config.get("llamaRepeatCtx,") as number,
-		temperature: config.get("llamaTemperature") as number,
-		top_p: config.get("llamaTop_p") as number,
-		top_k: config.get("llamaTop_k") as number,
-		typical_p: config.get("llamaTypical_p") as number,
-		tfs_z: config.get("llamaTailfree_z,") as number,
-		seed: config.get("llamaSeed") as number,
+export function createLlamacppRequest(
+	config: vscode.WorkspaceConfiguration,
+	doc_before: string,
+	doc_after: string
+): LlamaRequest {
+	let request: LlamaRequest = {
+		n_predict: config.get('llamaMaxtokens') as number,
+		mirostat: config.get('llamaMirostat') as number,
+		repeat_penalty: config.get('llamaRepeatPenalty') as number,
+		frequency_penalty: config.get('llamaFrequencyPenalty') as number,
+		presence_penalty: config.get('llamaPresencePenalty') as number,
+		repeat_last_n: config.get('llamaRepeatCtx') as number,
+		temperature: config.get('llamaTemperature') as number,
+		top_p: config.get('llamaTop_p') as number,
+		top_k: config.get('llamaTop_k') as number,
+		typical_p: config.get('llamaTypical_p') as number,
+		tfs_z: config.get('llamaTailfree_z') as number,
+		seed: config.get('llamaSeed') as number,
 		stream: false,
-		cache_prompt: config.get("llamaCachePrompt") as boolean
+		cache_prompt: config.get('llamaCachePrompt') as boolean,
 	};
 
-	const fim = config.get("fimEnabled") as boolean;
-	const fimRequest = config.get("useFillInMiddleRequest") as boolean;
+	const fim = config.get('fimEnabled') as boolean;
+	const fimRequest = config.get('useFillInMiddleRequest') as boolean;
 
 	if (fim === true) {
 		if (fimRequest === true) {
 			request.input_prefix = doc_before;
 			request.input_suffix = doc_after;
 		} else {
-			const fim_beg = config.get("fimBeginString") as string;
-			const fim_hole = config.get("fimHoleString") as string;
-			const fim_end = config.get("fimEndString") as string;
+			const fim_beg = config.get('fimBeginString') as string;
+			const fim_hole = config.get('fimHoleString') as string;
+			const fim_end = config.get('fimEndString') as string;
 			request.prompt = fim_beg + doc_before + fim_hole + doc_after + fim_end;
 		}
 	} else {
@@ -87,11 +95,10 @@ export function createLlamacppRequest(config: vscode.WorkspaceConfiguration, doc
 	return request;
 }
 
-export function llamacppRequestEndpoint(config: vscode.WorkspaceConfiguration): string
-{
-	const fim = config.get("fimEnabled") as boolean;
-	const fimRequest = config.get("useFillInMiddleRequest") as boolean;
-	let req_str: string = config.get("llamaHost") as string;
+export function llamacppRequestEndpoint(config: vscode.WorkspaceConfiguration): string {
+	const fim = config.get('fimEnabled') as boolean;
+	const fimRequest = config.get('useFillInMiddleRequest') as boolean;
+	let req_str: string = config.get('llamaHost') as string;
 
 	if (fim === true && fimRequest === true) {
 		req_str += '/infill';
@@ -100,4 +107,50 @@ export function llamacppRequestEndpoint(config: vscode.WorkspaceConfiguration):
 	}
 
 	return req_str;
-}
\ No newline at end of file
+}
+
+export async function llamacppMakeRequest(
+	request: LlamaRequest,
+	endpoint: string
+): Promise<ResponseData> {
+	let ret: ResponseData = {
+		content: '',
+		tokens: 0,
+		time: 0,
+	};
+	let data: LlamaData;
+	// try to send the request to the running server
+	try {
+		const response_promise = fetch(endpoint, {
+			method: 'POST',
+			headers: {
+				'content-type': 'application/json; charset=UTF-8',
+			},
+			body: JSON.stringify(request),
+		});
+
+		showPendingStatusBar('dumbpilot waiting', response_promise);
+		const response = await response_promise;
+		if (response.ok === false) {
+			throw new Error('llama server request is not ok??');
+		}
+
+		data = (await response.json()) as LlamaData;
+		const gen_tokens = data.timings.predicted_n;
+		const gen_time = (data.timings.predicted_ms / 1000).toFixed(2);
+		showMessageWithTimeout(`predicted ${gen_tokens} tokens in ${gen_time} seconds`, 1500);
+
+		ret.content = data.content;
+		ret.tokens = data.tokens_predicted;
+		ret.time = data.timings.predicted_ms / 1000;
+	} catch (e: any) {
+		const err = e as TypeError;
+		const cause: FetchErrorCause = err.cause as FetchErrorCause;
+		const estr: string =
+			err.message + ' ' + cause.code + ' at ' + cause.address + ':' + cause.port;
+		// let the user know something went wrong
+		// TODO: maybe add a retry action or something
+		showMessageWithTimeout('dumbpilot error: ' + estr, 3000);
+	}
+	return ret;
+}
diff --git a/src/openai-api.ts b/src/openai-api.ts
index e69de29..fdaad9b 100644
--- a/src/openai-api.ts
+++ b/src/openai-api.ts
@@ -0,0 +1,116 @@
+import * as vscode from 'vscode';
+
+// oogabooga/text-generation-webui OpenAI compatible API
+// https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
+
+type OpenAICompletionRequest = {
+	model?: string; // automatic
+	prompt: string;
+	best_of?: number; // 1
+	echo?: boolean; // false
+	frequency_penalty?: number; // null
+	logit_bias?: object; // null
+	logprobs?: number; // 0
+	max_tokens?: number; // 16
+	n?: number; // 1
+	presence_penalty?: number; // 0
+	stop?: string;
+	stream?: boolean; // false
+	suffix?: string;
+	temperature?: number; // 1
+	top_p?: number; // 1
+	user?: string;
+	preset?: string;
+	min_p?: number; // 1
+	top_k?: number; // 1
+	repetition_penalty?: number; // 1
+	repetition_penalty_range?: number; // 1024
+	typical_p?: number; // 1
+	tfs?: number; // 1
+	top_a?: number; // 0
+	epsilon_cutoff?: number; // 0
+	eta_cutoff?: number; // 0
+	guidance_scale?: number; // 1
+	negative_prompt?: string; // ""
+	penalty_alpha?: number; // 0
+	mirostat_mode?: number; // 0
+	mirostat_tau?: number; // 5
+	mirostat_eta?: number; // 0.1
+	temperature_last?: boolean; // false
+	do_sample?: boolean; // true
+	seed?: number; // -1
+	encoder_repetition_penalty?: number; // 1
+	no_repeat_ngram_size?: number; // 0
+	min_length?: number; // 0
+	num_beams?: number; // 1
+	length_penalty?: number; // 1
+	early_stopping?: boolean; // false
+	truncation_length?: number; // 0
+	max_tokens_second?: number; // 0
+	custom_token_bans?: string; // ""
+	auto_max_new_tokens?: boolean; // false
+	ban_eos_token?: boolean; // false
+	add_bos_token?: boolean; // true
+	skip_special_tokens?: boolean; // true
+	grammar_string?: string; // ''
+};
+
+type OpenAICompletionSuccessResponse = {
+	id: string;
+	choices: object[];
+	created?: number;
+	model: string;
+	object?: string;
+	usage: object;
+};
+
+type OpenAICompletionFailureResponse = {
+	detail: {
+		loc: (string | number)[];
+		msg: string;
+		type: string;
+	}[];
+};
+
+type OpenAICompletionResponse = OpenAICompletionSuccessResponse | OpenAICompletionFailureResponse;
+
+export function createOpenAIAPIRequest(
+	config: vscode.WorkspaceConfiguration,
+	doc_before: string,
+	doc_after: string
+): OpenAICompletionRequest {
+	let request: OpenAICompletionRequest = {
+		prompt: '',
+		max_tokens: config.get('llamaMaxtokens') as number,
+		mirostat_mode: config.get('llamaMirostat') as number,
+		repetition_penalty: config.get('llamaRepeatPenalty') as number,
+		frequency_penalty: config.get('llamaFrequencyPenalty') as number,
+		presence_penalty: config.get('llamaPresencePenalty') as number,
+		repetition_penalty_range: config.get('llamaRepeatCtx') as number,
+		temperature: config.get('llamaTemperature') as number,
+		top_p: config.get('llamaTop_p') as number,
+		top_k: config.get('llamaTop_k') as number,
+		typical_p: config.get('llamaTypical_p') as number,
+		tfs: config.get('llamaTailfree_z') as number,
+		seed: config.get('llamaSeed') as number,
+		stream: false,
+	};
+
+	const fim = config.get('fimEnabled') as boolean;
+
+	if (fim === true) {
+		const fim_beg = config.get('fimBeginString') as string;
+		const fim_hole = config.get('fimHoleString') as string;
+		const fim_end = config.get('fimEndString') as string;
+		request.prompt = fim_beg + doc_before + fim_hole + doc_after + fim_end;
+	} else {
+		request.prompt = doc_before;
+	}
+
+	return request;
+}
+
+// for now only completions is implemented
+export function OpenAIAPIRequestEndpoint(config: vscode.WorkspaceConfiguration): string {
+	return '/v1/completions';
+}
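
Note: this patch wires the llama.cpp path end to end, but the OpenAI side only ships the request builder. Nothing in the extension.ts changes shown here reads dumbpilot.llamaUseOpenAIAPI yet, and there is no counterpart to llamacppMakeRequest that actually sends the OpenAI-style request. A minimal sketch of how that missing piece could look follows; openAIMakeRequest, its import of the helpers from ./common, and its error handling are assumptions modelled on llamacppMakeRequest, not code from this commit. Since OpenAIAPIRequestEndpoint returns only the /v1/completions path, the caller would also still prepend the llamaHost setting when building the URL.

// Hypothetical addition to src/openai-api.ts (not part of this patch): send the
// completion request to an OpenAI-compatible server and reduce the answer to the
// common ResponseData shape used by the rest of the extension.
import { ResponseData, showMessageWithTimeout, showPendingStatusBar } from './common';

export async function openAIMakeRequest(
	request: OpenAICompletionRequest,
	endpoint: string
): Promise<ResponseData> {
	let ret: ResponseData = { content: '', tokens: 0, time: 0 };
	try {
		const response_promise = fetch(endpoint, {
			method: 'POST',
			headers: { 'content-type': 'application/json; charset=UTF-8' },
			body: JSON.stringify(request),
		});
		showPendingStatusBar('dumbpilot waiting', response_promise);
		const response = await response_promise;
		if (response.ok === false) {
			throw new Error('OpenAI-compatible server request is not ok??');
		}
		const data = (await response.json()) as OpenAICompletionResponse;
		// a failure response carries a `detail` array instead of `choices`
		if ('detail' in data) {
			throw new Error(data.detail.map((d) => d.msg).join(', '));
		}
		// for /v1/completions the generated text is in choices[0].text
		ret.content = (data.choices[0] as { text: string }).text;
	} catch (e: any) {
		// let the user know something went wrong
		showMessageWithTimeout('dumbpilot error: ' + (e as Error).message, 3000);
	}
	return ret;
}

With a helper like this in place, extension.ts could pick between the two backends with a single check on config.get('llamaUseOpenAIAPI') before building and sending the request.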