diff --git a/TODO.md b/TODO.md
index 5d66f9d..cae975b 100644
--- a/TODO.md
+++ b/TODO.md
@@ -7,7 +7,9 @@
 [] - add a chat window
 [] - if the model is an instruct-type add the system prompt to the chat
 [x] - add an icon
-[] - option to backup and restore model settings
 [] - add a window to quickly modify model configs
+[] - sliders for context lengths and generated buttons
+[] - option to backup and restore model settings as JSON
 [] - decorate ai generated text https://github.com/microsoft/vscode-extension-samples/tree/main/decorator-sample
 [] - when trying to use completion when there is an active selection either substitute the selection or use the selection as context instead of the whole file
+[] - when in stream mode, print the tokens as they arrive in the status bar as an additional indication that the model is generating and to quickly check if the model has entered a loop
diff --git a/package.json b/package.json
index e7db402..0fa9e4a 100644
--- a/package.json
+++ b/package.json
@@ -238,6 +238,10 @@
 					"type": "string",
 					"default": "deepseek-coder-6.7B-base.gguf",
 					"description": "Name of the model to use, only works in OpenAI API mode"
+				},
+				"dumbpilot.llamaAPIStream": {
+					"type": "boolean",
+					"default": false
 				}
 			}
 		}
diff --git a/src/extension.ts b/src/extension.ts
index 1cc5355..88a8196 100644
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -105,6 +105,7 @@ export function activate(context: vscode.ExtensionContext) {
 
 		// actially make the request
 		let data: ResponseData = { content: '', tokens: 0, time: 0 };
+		let promise: Promise<ResponseData>;
 		if (config.get('llamaUseOpenAIAPI') === true) {
 			const request: OpenAICompletionRequest = createOpenAIAPIRequest(
 				config,
@@ -112,13 +113,20 @@ export function activate(context: vscode.ExtensionContext) {
 				doc_before,
 				doc_after
 			);
 			const endpoint: string = openAIAPIRequestEndpoint(config);
-			data = await openAIMakeRequest(request, endpoint);
+			promise = openAIMakeRequest(request, endpoint);
 		} else {
 			const request: LlamaRequest = createLlamacppRequest(config, doc_before, doc_after);
 			const endpoint: string = llamacppRequestEndpoint(config);
-			data = await llamacppMakeRequest(request, endpoint);
+			promise = llamacppMakeRequest(request, endpoint);
 		}
+		showPendingStatusBar('dumbpilot waiting', promise);
+		data = await promise;
+		showMessageWithTimeout(
+			`predicted ${data.tokens} tokens in ${data.time.toFixed(2)} seconds`,
+			1500
+		);
+
 		result.items.push({
 			insertText: data.content,
 			range: new vscode.Range(position, position),
diff --git a/src/llamacpp-api.ts b/src/llamacpp-api.ts
index 641a738..28756c4 100644
--- a/src/llamacpp-api.ts
+++ b/src/llamacpp-api.ts
@@ -129,7 +129,6 @@ export async function llamacppMakeRequest(
 			body: JSON.stringify(request),
 		});
 
-		showPendingStatusBar('dumbpilot waiting', response_promise);
 		const response = await response_promise;
 		if (response.ok === false) {
 			throw new Error('llama server request is not ok??');
@@ -138,7 +137,6 @@ export async function llamacppMakeRequest(
 		data = (await response.json()) as LlamaData;
 		const gen_tokens = data.timings.predicted_n;
 		const gen_time = (data.timings.predicted_ms / 1000).toFixed(2);
-		showMessageWithTimeout(`predicted ${gen_tokens} tokens in ${gen_time} seconds`, 1500);
 
 		ret.content = data.content;
 		ret.tokens = data.tokens_predicted;
diff --git a/src/openai-api.ts b/src/openai-api.ts
index 7722ad7..b3227be 100644
--- a/src/openai-api.ts
+++ b/src/openai-api.ts
@@ -5,6 +5,7 @@ import {
 	showMessageWithTimeout,
 	showPendingStatusBar,
 } from './common';
+import { config } from 'process';
 
 // oogabooga/text-generation-webui OpenAI compatible API
 // https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
@@ -73,7 +74,7 @@ type OpenAICompletionSuccessResponse = {
 	model: string;
 	object?: string;
 	usage: {
-		completion_tokens: number;
+		completion_tokens?: number;
 		prompt_tokens: number;
 		total_tokens: number;
 	};
@@ -108,7 +109,7 @@ export function createOpenAIAPIRequest(
 		typical_p: config.get('llamaTypical_p') as number,
 		tfs: config.get('llamaTailfree_z,') as number,
 		seed: config.get('llamaSeed') as number,
-		stream: false,
+		stream: config.get('llamaAPIStream'),
 	};
 
 	const fim = config.get('fimEnabled') as boolean;
@@ -131,7 +132,7 @@ export function openAIAPIRequestEndpoint(config: vscode.WorkspaceConfiguration):
 }
 
 export async function openAIMakeRequest(
-	request: OpenAICompletionRequest,
+	request_body: OpenAICompletionRequest,
 	endpoint: string
 ): Promise<ResponseData> {
 	let ret: ResponseData = {
@@ -140,45 +141,67 @@
 		time: 0,
 	};
 	let data: OpenAICompletionResponse;
+	const is_stream: boolean = request_body.stream === true ? true : false;
+
+	// format the request
+	const request: RequestInit = {
+		method: 'POST',
+		headers: {
+			'content-type': 'application/json; charset=UTF-8',
+		},
+		body: JSON.stringify(request_body),
+	};
+
 	// try to send the request to the running server
 	try {
-		const response_promise = fetch(endpoint, {
-			method: 'POST',
-			headers: {
-				'content-type': 'application/json; charset=UTF-8',
-			},
-			body: JSON.stringify(request),
-		});
-
-		showPendingStatusBar('dumbpilot waiting', response_promise);
-
-		// TODO: measure the time it takes the server to respond
-		let resp_time: number = 0;
+		const response_promise = fetch(endpoint, request);
+
+		// if doing a stream request we have to attach a reader and join
+		// the individual responses
 		const response = await response_promise;
-		if (response.ok === false) {
-			throw new Error('llama server request is not ok??');
+		// read the data chunk by chunk using asynchronous iteration
+		if (response.body === null) {
+			throw new Error('null response body');
 		}
-		data = (await response.json()) as OpenAICompletionResponse;
-
-		// check wether the remote gave back an error
-		if (Object.hasOwn(data, 'detail') === true) {
-			data = data as OpenAICompletionFailureResponse;
-			// TODO: why did it error?
-			throw new Error('OpenAI Endpoint Error');
+		// start a timer
+		const timer_start = performance.now();
+
+		for await (const chunk of response.body) {
+			// FIXME: why the fuck do I have to do this shite
+			let data_text = new TextDecoder().decode(chunk);
+			data_text = data_text.substring(data_text.indexOf('{'));
+			let data: OpenAICompletionResponse;
+
+			try {
+				data = JSON.parse(data_text);
+			} catch (e: any) {
+				console.error(e);
+				return ret;
+			}
+			//console.log(JSON.stringify(data));
+
+			if (Object.hasOwn(data, 'detail') === true) {
+				data = data as OpenAICompletionFailureResponse;
+				// TODO: why did it error?
+				throw new Error('OpenAI Endpoint Error');
+			}
+
+			// unpack the data
+			data = data as OpenAICompletionSuccessResponse;
+			// FIXME: why the choices may be multiple?
+			// TODO: display the multiple choices
+			//console.log(data.choices[0].text);
+			ret.content += data.choices[0].text;
+			ret.tokens += data.usage?.completion_tokens || 0;
 		}
-		// unpack the data
-		data = data as OpenAICompletionSuccessResponse;
-		// FIXME: why the choices may be multiple?
-		// TODO: display the multiple choices
-		ret.content = data.choices[0].text;
-		ret.tokens = data.usage.completion_tokens;
-		ret.time = resp_time;
-
-		showMessageWithTimeout(`predicted ${ret.tokens} tokens in ${ret.time} seconds`, 1500);
+		// stop the timer
+		const timer_end = performance.now();
+		ret.time = (timer_end - timer_start) / 1000.0;
 	} catch (e: any) {
+		console.error(e);
 		const err = e as TypeError;
 		const cause: FetchErrorCause = err.cause as FetchErrorCause;
 		const estr: string =