added streaming support

master
Alessandro Mauri 11 months ago
parent fe68daac38
commit b75dce72cd
  1. 4
      TODO.md
  2. 4
      package.json
  3. 12
      src/extension.ts
  4. 2
      src/llamacpp-api.ts
  5. 89
      src/openai-api.ts

@ -7,7 +7,9 @@
[] - add a chat window [] - add a chat window
[] - if the model is an instruct-type add the system prompt to the chat [] - if the model is an instruct-type add the system prompt to the chat
[x] - add an icon [x] - add an icon
[] - option to backup and restore model settings
[] - add a window to quickly modify model configs [] - add a window to quickly modify model configs
[] - sliders for context lengths and generated buttons
[] - option to backup and restore model settings as JSON
[] - decorate ai generated text https://github.com/microsoft/vscode-extension-samples/tree/main/decorator-sample [] - decorate ai generated text https://github.com/microsoft/vscode-extension-samples/tree/main/decorator-sample
[] - when trying to use completion when there is an active selection either substitute the selection or use the selection as context instead of the whole file [] - when trying to use completion when there is an active selection either substitute the selection or use the selection as context instead of the whole file
[] - when in stream mode, print the tokens as they arrive in the status bar as an additional indication that the model is generating and to quickly check if the model has entered a loop

@ -238,6 +238,10 @@
"type": "string", "type": "string",
"default": "deepseek-coder-6.7B-base.gguf", "default": "deepseek-coder-6.7B-base.gguf",
"description": "Name of the model to use, only works in OpenAI API mode" "description": "Name of the model to use, only works in OpenAI API mode"
},
"dumbpilot.llamaAPIStream": {
"type": "boolean",
"default": false
} }
} }
} }

@ -105,6 +105,7 @@ export function activate(context: vscode.ExtensionContext) {
// actually make the request // actually make the request
let data: ResponseData = { content: '', tokens: 0, time: 0 }; let data: ResponseData = { content: '', tokens: 0, time: 0 };
let promise: Promise<ResponseData>;
if (config.get('llamaUseOpenAIAPI') === true) { if (config.get('llamaUseOpenAIAPI') === true) {
const request: OpenAICompletionRequest = createOpenAIAPIRequest( const request: OpenAICompletionRequest = createOpenAIAPIRequest(
config, config,
@ -112,13 +113,20 @@ export function activate(context: vscode.ExtensionContext) {
doc_after doc_after
); );
const endpoint: string = openAIAPIRequestEndpoint(config); const endpoint: string = openAIAPIRequestEndpoint(config);
data = await openAIMakeRequest(request, endpoint); promise = openAIMakeRequest(request, endpoint);
} else { } else {
const request: LlamaRequest = createLlamacppRequest(config, doc_before, doc_after); const request: LlamaRequest = createLlamacppRequest(config, doc_before, doc_after);
const endpoint: string = llamacppRequestEndpoint(config); const endpoint: string = llamacppRequestEndpoint(config);
data = await llamacppMakeRequest(request, endpoint); promise = llamacppMakeRequest(request, endpoint);
} }
showPendingStatusBar('dumbpilot waiting', promise);
data = await promise;
showMessageWithTimeout(
`predicted ${data.tokens} tokens in ${data.time.toFixed(2)} seconds`,
1500
);
result.items.push({ result.items.push({
insertText: data.content, insertText: data.content,
range: new vscode.Range(position, position), range: new vscode.Range(position, position),

@ -129,7 +129,6 @@ export async function llamacppMakeRequest(
body: JSON.stringify(request), body: JSON.stringify(request),
}); });
showPendingStatusBar('dumbpilot waiting', response_promise);
const response = await response_promise; const response = await response_promise;
if (response.ok === false) { if (response.ok === false) {
throw new Error('llama server request is not ok??'); throw new Error('llama server request is not ok??');
@ -138,7 +137,6 @@ export async function llamacppMakeRequest(
data = (await response.json()) as LlamaData; data = (await response.json()) as LlamaData;
const gen_tokens = data.timings.predicted_n; const gen_tokens = data.timings.predicted_n;
const gen_time = (data.timings.predicted_ms / 1000).toFixed(2); const gen_time = (data.timings.predicted_ms / 1000).toFixed(2);
showMessageWithTimeout(`predicted ${gen_tokens} tokens in ${gen_time} seconds`, 1500);
ret.content = data.content; ret.content = data.content;
ret.tokens = data.tokens_predicted; ret.tokens = data.tokens_predicted;

@ -5,6 +5,7 @@ import {
showMessageWithTimeout, showMessageWithTimeout,
showPendingStatusBar, showPendingStatusBar,
} from './common'; } from './common';
import { config } from 'process';
// oobabooga/text-generation-webui OpenAI compatible API // oobabooga/text-generation-webui OpenAI compatible API
// https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API // https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
@ -73,7 +74,7 @@ type OpenAICompletionSuccessResponse = {
model: string; model: string;
object?: string; object?: string;
usage: { usage: {
completion_tokens: number; completion_tokens?: number;
prompt_tokens: number; prompt_tokens: number;
total_tokens: number; total_tokens: number;
}; };
@ -108,7 +109,7 @@ export function createOpenAIAPIRequest(
typical_p: config.get('llamaTypical_p') as number, typical_p: config.get('llamaTypical_p') as number,
tfs: config.get('llamaTailfree_z,') as number, tfs: config.get('llamaTailfree_z,') as number,
seed: config.get('llamaSeed') as number, seed: config.get('llamaSeed') as number,
stream: false, stream: config.get('llamaAPIStream'),
}; };
const fim = config.get('fimEnabled') as boolean; const fim = config.get('fimEnabled') as boolean;
@ -131,7 +132,7 @@ export function openAIAPIRequestEndpoint(config: vscode.WorkspaceConfiguration):
} }
export async function openAIMakeRequest( export async function openAIMakeRequest(
request: OpenAICompletionRequest, request_body: OpenAICompletionRequest,
endpoint: string endpoint: string
): Promise<ResponseData> { ): Promise<ResponseData> {
let ret: ResponseData = { let ret: ResponseData = {
@ -140,45 +141,67 @@ export async function openAIMakeRequest(
time: 0, time: 0,
}; };
let data: OpenAICompletionResponse; let data: OpenAICompletionResponse;
const is_stream: boolean = request_body.stream === true ? true : false;
// format the request
const request: RequestInit = {
method: 'POST',
headers: {
'content-type': 'application/json; charset=UTF-8',
},
body: JSON.stringify(request_body),
};
// try to send the request to the running server // try to send the request to the running server
try { try {
const response_promise = fetch(endpoint, { const response_promise = fetch(endpoint, request);
method: 'POST',
headers: { // if doing a stream request we have to attach a reader and join
'content-type': 'application/json; charset=UTF-8', // the individual responses
},
body: JSON.stringify(request),
});
showPendingStatusBar('dumbpilot waiting', response_promise);
// TODO: measure the time it takes the server to respond
let resp_time: number = 0;
const response = await response_promise; const response = await response_promise;
if (response.ok === false) { // read the data chunk by chunk using asynchronous iteration
throw new Error('llama server request is not ok??'); if (response.body === null) {
throw new Error('null response body');
} }
data = (await response.json()) as OpenAICompletionResponse; // start a timer
const timer_start = performance.now();
// check whether the remote gave back an error
if (Object.hasOwn(data, 'detail') === true) { for await (const chunk of response.body) {
data = data as OpenAICompletionFailureResponse; // FIXME: find out why everything before the first '{' must be stripped from each chunk (presumably a server-sent-events `data: ` prefix; confirm)
// TODO: why did it error? let data_text = new TextDecoder().decode(chunk);
throw new Error('OpenAI Endpoint Error'); data_text = data_text.substring(data_text.indexOf('{'));
let data: OpenAICompletionResponse;
try {
data = JSON.parse(data_text);
} catch (e: any) {
console.error(e);
return ret;
}
//console.log(JSON.stringify(data));
if (Object.hasOwn(data, 'detail') === true) {
data = data as OpenAICompletionFailureResponse;
// TODO: why did it error?
throw new Error('OpenAI Endpoint Error');
}
// unpack the data
data = data as OpenAICompletionSuccessResponse;
// FIXME: why can there be multiple choices?
// TODO: display the multiple choices
//console.log(data.choices[0].text);
ret.content += data.choices[0].text;
ret.tokens += data.usage?.completion_tokens || 0;
} }
// unpack the data // stop the timer
data = data as OpenAICompletionSuccessResponse; const timer_end = performance.now();
// FIXME: why can there be multiple choices? ret.time = (timer_end - timer_start) / 1000.0;
// TODO: display the multiple choices
ret.content = data.choices[0].text;
ret.tokens = data.usage.completion_tokens;
ret.time = resp_time;
showMessageWithTimeout(`predicted ${ret.tokens} tokens in ${ret.time} seconds`, 1500);
} catch (e: any) { } catch (e: any) {
console.error(e);
const err = e as TypeError; const err = e as TypeError;
const cause: FetchErrorCause = err.cause as FetchErrorCause; const cause: FetchErrorCause = err.cause as FetchErrorCause;
const estr: string = const estr: string =

Loading…
Cancel
Save