diff --git a/package.json b/package.json
index d7d2e9c..eb8bd8c 100644
--- a/package.json
+++ b/package.json
@@ -153,7 +153,12 @@
         "dumbpilot.llamaTop_k": {"type": "number", "default": 40},
         "dumbpilot.llamaTypical_p": {"type": "number", "default": 0.95},
         "dumbpilot.llamaTailfree_z": {"type": "number", "default": 0.5},
-        "dumbpilot.llamaSeed": {"type": "number", "default": -1}
+        "dumbpilot.llamaSeed": {"type": "number", "default": -1},
+        "dumbpilot.llamaCachePrompt": {
+          "type": "boolean",
+          "default": true,
+          "description": "Enable prompt caching for faster results"
+        }
       }
     }
   },
diff --git a/src/extension.ts b/src/extension.ts
index 1a32283..7f9c24a 100644
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -29,7 +29,7 @@ type llamaData = {
     truncated: boolean
 };
 
-type llamaCompletionRequest = {
+type llamaRequest = {
     n_predict: number,
     mirostat: number,
     repeat_penalty: number,
@@ -43,25 +43,10 @@ type llamaCompletionRequest = {
     tfs_z: number,
     seed: number,
     stream: boolean,
-    prompt: string,
-};
-
-type llamaFillRequest = {
-    n_predict: number,
-    mirostat: number,
-    repeat_penalty: number,
-    frequency_penalty: number,
-    presence_penalty: number,
-    repeat_last_n: number,
-    temperature: number,
-    top_p: number,
-    top_k: number,
-    typical_p: number,
-    tfs_z: number,
-    seed: number,
-    stream: boolean,
-    input_prefix: string,
-    input_suffix: string
+    cache_prompt: boolean,
+    prompt?: string,
+    input_prefix?: string,
+    input_suffix?: string
 };
 
 type fetchErrorCause = {
@@ -118,7 +103,13 @@ export function activate(context: vscode.ExtensionContext) {
 
     console.log('dumbpilot is now active');
 
-    const config = vscode.workspace.getConfiguration("dumbpilot");
+    let config = vscode.workspace.getConfiguration("dumbpilot");
+
+    // handle configuration changes
+    context.subscriptions.push(vscode.workspace.onDidChangeConfiguration(e => {
+        config = vscode.workspace.getConfiguration("dumbpilot");
+    }));
+
     let completion_enabled: boolean = config.get("completionEnabled") as boolean;
 
     // TODO: work with local configurations
@@ -183,8 +174,9 @@ export function activate(context: vscode.ExtensionContext) {
         // FIXME: is there a more efficient way?
         doc_before = pfx + ' ' + fname + sfx + '\n' + doc_before;
 
-        // server request object
-        const request: llamaCompletionRequest = {
+        const fim = config.get("fimEnabled") as boolean;
+        let req_str: string;
+        let request: llamaRequest = {
             n_predict: config.get("llamaMaxtokens") as number,
             mirostat: config.get("llamaMirostat") as number,
             repeat_penalty: config.get("llamaRepeatPenalty") as number,
@@ -198,14 +190,24 @@
             tfs_z: config.get("llamaTailfree_z,") as number,
             seed: config.get("llamaSeed") as number,
             stream: false,
-            prompt: doc_before,
+            cache_prompt: config.get("llamaCachePrompt") as boolean
         };
-
+
+        if (fim === true) {
+            req_str = '/infill';
+            request.input_prefix = doc_before;
+            request.input_suffix = doc_after;
+        } else {
+            req_str = '/completion';
+            request.prompt = doc_before;
+        }
+        console.log(fim);
+
         let data: llamaData;
         // try to send the request to the running server
         try {
            const response_promise = fetch(
-                (config.get("llamaHost") as string).concat('/completion'),
+                (config.get("llamaHost") as string).concat(req_str),
                 {
                     method: 'POST',
                     headers: {
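
Note on reading the new setting: the diff uses config.get("llamaCachePrompt") as boolean, which only yields a defined value because package.json supplies a default. A small sketch of a more defensive read with the same VS Code API, where the explicit fallback of true is an assumption matching the declared default:

import * as vscode from 'vscode';

const config = vscode.workspace.getConfiguration("dumbpilot");
// get() with a default value never returns undefined, unlike the plain cast
const cachePrompt: boolean = config.get<boolean>("llamaCachePrompt", true);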
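
The onDidChangeConfiguration handler added in extension.ts fires for every settings change in the window, not only for dumbpilot settings. A sketch of a narrower variant; everything matches the diff except the affectsConfiguration() guard:

import * as vscode from 'vscode';

export function activate(context: vscode.ExtensionContext) {
    let config = vscode.workspace.getConfiguration("dumbpilot");

    context.subscriptions.push(vscode.workspace.onDidChangeConfiguration(e => {
        // re-read the configuration only when a dumbpilot.* setting actually changed
        if (e.affectsConfiguration("dumbpilot")) {
            config = vscode.workspace.getConfiguration("dumbpilot");
        }
    }));
}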
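
For reference, the request flow the extension.ts hunks implement, condensed into one sketch. As the diff shows, the llama.cpp server's /completion endpoint takes prompt, /infill takes input_prefix and input_suffix, and both accept cache_prompt. sendLlamaRequest is a hypothetical helper name, the field list is trimmed to the parameters visible in the hunks, and the Content-Type header and JSON.stringify body are assumptions about the part of the fetch call the last hunk cuts off:

type llamaRequest = {
    n_predict: number,
    stream: boolean,
    cache_prompt: boolean,
    prompt?: string,
    input_prefix?: string,
    input_suffix?: string
};

// Hypothetical helper mirroring what the diff does inline in the completion provider:
// build one request object, then pick the endpoint based on the fimEnabled setting.
async function sendLlamaRequest(host: string, fim: boolean, doc_before: string, doc_after: string) {
    const request: llamaRequest = {
        n_predict: 128,      // stands in for config.get("llamaMaxtokens")
        stream: false,
        cache_prompt: true   // stands in for config.get("llamaCachePrompt")
    };
    const endpoint = fim ? '/infill' : '/completion';
    if (fim) {
        request.input_prefix = doc_before;
        request.input_suffix = doc_after;
    } else {
        request.prompt = doc_before;
    }
    return fetch(host.concat(endpoint), {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(request)
    });
}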