2025/04/07 20:17:09 routes.go:1231: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:2048 OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:C:\\Users\\jamie\\.ollama\\models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES:]"
time=2025-04-07T20:17:09.024-06:00 level=INFO source=images.go:458 msg="total blobs: 19"
time=2025-04-07T20:17:09.024-06:00 level=INFO source=images.go:465 msg="total unused blobs removed: 0"
time=2025-04-07T20:17:09.024-06:00 level=INFO source=routes.go:1298 msg="Listening on 127.0.0.1:11434 (version 0.6.5)"
time=2025-04-07T20:17:09.024-06:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-04-07T20:17:09.024-06:00 level=INFO source=gpu_windows.go:167 msg=packages count=1
time=2025-04-07T20:17:09.024-06:00 level=INFO source=gpu_windows.go:183 msg="efficiency cores detected" maxEfficiencyClass=1
time=2025-04-07T20:17:09.024-06:00 level=INFO source=gpu_windows.go:214 msg="" package=0 cores=24 efficiency=16 threads=32
time=2025-04-07T20:17:09.155-06:00 level=INFO source=types.go:130 msg="inference compute" id=GPU-463e373a-2b0b-5c5a-a5e7-bf8128659ba1 library=cuda variant=v12 compute=8.9 driver=12.8 name="NVIDIA GeForce RTX 4070 Ti SUPER" total="16.0 GiB" available="14.7 GiB"
[GIN] 2025/04/07 - 20:18:04 | 200 | 0s | 127.0.0.1 | HEAD "/"
[GIN] 2025/04/07 - 20:18:04 | 500 | 0s | 127.0.0.1 | POST "/api/show"
[GIN] 2025/04/07 - 20:18:11 | 200 | 0s | 127.0.0.1 | HEAD "/"
[GIN] 2025/04/07 - 20:18:11 | 200 | 48.2181ms | 127.0.0.1 | POST "/api/show"
time=2025-04-07T20:18:11.961-06:00 level=INFO source=sched.go:716 msg="new model will fit in available VRAM in single GPU, loading" model=C:\Users\jamie\.ollama\models\blobs\sha256-e8ad13eff07a78d89926e9e8b882317d082ef5bf9768ad7b50fcdbbcd63748de gpu=GPU-463e373a-2b0b-5c5a-a5e7-bf8128659ba1 parallel=4 available=15600865280 required="11.6 GiB"
time=2025-04-07T20:18:11.974-06:00 level=INFO source=server.go:105 msg="system memory" total="63.8 GiB" free="52.0 GiB" free_swap="49.9 GiB"
time=2025-04-07T20:18:11.975-06:00 level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=49 layers.offload=49 layers.split="" memory.available="[14.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="11.6 GiB" memory.required.partial="11.6 GiB" memory.required.kv="1.9 GiB" memory.required.allocations="[11.6 GiB]" memory.weights.total="6.8 GiB" memory.weights.repeating="6.0 GiB" memory.weights.nonrepeating="787.5 MiB" memory.graph.full="519.5 MiB" memory.graph.partial="1.3 GiB" projector.weights="795.9 MiB" projector.graph="1.0 GiB"
time=2025-04-07T20:18:12.029-06:00 level=WARN source=ggml.go:152 msg="key not found" key=tokenizer.ggml.add_eot_token default=false
time=2025-04-07T20:18:12.033-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.attention.layer_norm_rms_epsilon default=9.999999974752427e-07
time=2025-04-07T20:18:12.034-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.local.freq_base default=10000
time=2025-04-07T20:18:12.034-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.global.freq_base default=1e+06
time=2025-04-07T20:18:12.034-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.freq_scale default=1
time=2025-04-07T20:18:12.034-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.mm_tokens_per_image default=256
time=2025-04-07T20:18:12.036-06:00 level=INFO source=server.go:405 msg="starting llama server" cmd="C:\\Users\\jamie\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --ollama-engine --model C:\\Users\\jamie\\.ollama\\models\\blobs\\sha256-e8ad13eff07a78d89926e9e8b882317d082ef5bf9768ad7b50fcdbbcd63748de --ctx-size 8192 --batch-size 512 --n-gpu-layers 49 --threads 8 --no-mmap --parallel 4 --port 64157"
time=2025-04-07T20:18:12.037-06:00 level=INFO source=sched.go:451 msg="loaded runners" count=1
time=2025-04-07T20:18:12.037-06:00 level=INFO source=server.go:580 msg="waiting for llama runner to start responding"
time=2025-04-07T20:18:12.038-06:00 level=INFO source=server.go:614 msg="waiting for server to become available" status="llm server error"
time=2025-04-07T20:18:12.049-06:00 level=INFO source=runner.go:816 msg="starting ollama engine"
time=2025-04-07T20:18:12.050-06:00 level=INFO source=runner.go:879 msg="Server listening on 127.0.0.1:64157"
time=2025-04-07T20:18:12.101-06:00 level=WARN source=ggml.go:152 msg="key not found" key=general.name default=""
time=2025-04-07T20:18:12.101-06:00 level=WARN source=ggml.go:152 msg="key not found" key=general.description default=""
time=2025-04-07T20:18:12.101-06:00 level=INFO source=ggml.go:67 msg="" architecture=gemma3 file_type=Q4_K_M name="" description="" num_tensors=1065 num_key_values=37
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4070 Ti SUPER, compute capability 8.9, VMM: yes
load_backend: loaded CUDA backend from C:\Users\jamie\AppData\Local\Programs\Ollama\lib\ollama\cuda_v12\ggml-cuda.dll
load_backend: loaded CPU backend from C:\Users\jamie\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-alderlake.dll
time=2025-04-07T20:18:12.161-06:00 level=INFO source=ggml.go:109 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX_VNNI=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
time=2025-04-07T20:18:12.224-06:00 level=INFO source=ggml.go:289 msg="model weights" buffer=CUDA0 size="7.6 GiB"
time=2025-04-07T20:18:12.224-06:00 level=INFO source=ggml.go:289 msg="model weights" buffer=CPU size="787.5 MiB"
time=2025-04-07T20:18:12.289-06:00 level=INFO source=server.go:614 msg="waiting for server to become available" status="llm server loading model"
time=2025-04-07T20:18:13.404-06:00 level=INFO source=ggml.go:388 msg="compute graph" backend=CUDA0 buffer_type=CUDA0
time=2025-04-07T20:18:13.404-06:00 level=INFO source=ggml.go:388 msg="compute graph" backend=CPU buffer_type=CUDA_Host
time=2025-04-07T20:18:13.406-06:00 level=WARN source=ggml.go:152 msg="key not found" key=tokenizer.ggml.add_eot_token default=false
time=2025-04-07T20:18:13.409-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.attention.layer_norm_rms_epsilon default=9.999999974752427e-07
time=2025-04-07T20:18:13.409-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.local.freq_base default=10000
time=2025-04-07T20:18:13.409-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.global.freq_base default=1e+06
time=2025-04-07T20:18:13.409-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.freq_scale default=1
time=2025-04-07T20:18:13.409-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.mm_tokens_per_image default=256
time=2025-04-07T20:18:13.542-06:00 level=INFO source=server.go:619 msg="llama runner started in 1.50 seconds"
[GIN] 2025/04/07 - 20:18:13 | 200 | 1.6756389s | 127.0.0.1 | POST "/api/generate"
[GIN] 2025/04/07 - 20:20:20 | 200 | 1.3864007s | 127.0.0.1 | POST "/api/chat"
llama_model_loader: loaded meta data with 36 key-value pairs and 1065 tensors from C:\Users\jamie\.ollama\models\blobs\sha256-e8ad13eff07a78d89926e9e8b882317d082ef5bf9768ad7b50fcdbbcd63748de (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0: gemma3.attention.head_count u32 = 16
llama_model_loader: - kv   1: gemma3.attention.head_count_kv u32 = 8
llama_model_loader: - kv   2: gemma3.attention.key_length u32 = 256
llama_model_loader: - kv   3: gemma3.attention.sliding_window u32 = 1024
llama_model_loader: - kv   4: gemma3.attention.value_length u32 = 256
llama_model_loader: - kv   5: gemma3.block_count u32 = 48
llama_model_loader: - kv   6: gemma3.context_length u32 = 131072
llama_model_loader: - kv   7: gemma3.embedding_length u32 = 3840
llama_model_loader: - kv   8: gemma3.feed_forward_length u32 = 15360
llama_model_loader: - kv   9: gemma3.mm.tokens_per_image u32 = 256
llama_model_loader: - kv  10: gemma3.vision.attention.head_count u32 = 16
llama_model_loader: - kv  11: gemma3.vision.attention.layer_norm_epsilon f32 = 0.000001
llama_model_loader: - kv  12: gemma3.vision.block_count u32 = 27
llama_model_loader: - kv  13: gemma3.vision.embedding_length u32 = 1152
llama_model_loader: - kv  14: gemma3.vision.feed_forward_length u32 = 4304
llama_model_loader: - kv  15: gemma3.vision.image_size u32 = 896
llama_model_loader: - kv  16: gemma3.vision.num_channels u32 = 3
llama_model_loader: - kv  17: gemma3.vision.patch_size u32 = 14
llama_model_loader: - kv  18: general.architecture str = gemma3
llama_model_loader: - kv  19: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r...
llama_model_loader: - kv  20: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv  21: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv  22: tokenizer.ggml.add_padding_token bool = false
llama_model_loader: - kv  23: tokenizer.ggml.add_unknown_token bool = false
llama_model_loader: - kv  24: tokenizer.ggml.bos_token_id u32 = 2
llama_model_loader: - kv  25: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv  26: tokenizer.ggml.merges arr[str,514906] = ["\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n", ...
llama_model_loader: - kv  27: tokenizer.ggml.model str = llama
llama_model_loader: - kv  28: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv  29: tokenizer.ggml.pre str = default
llama_model_loader: - kv  30: tokenizer.ggml.scores arr[f32,262145] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  31: tokenizer.ggml.token_type arr[i32,262145] = [3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  32: tokenizer.ggml.tokens arr[str,262145] = ["", "", "", "", ...
llama_model_loader: - kv  33: tokenizer.ggml.unknown_token_id u32 = 3
llama_model_loader: - kv  34: general.quantization_version u32 = 2
llama_model_loader: - kv  35: general.file_type u32 = 15
llama_model_loader: - type f32: 563 tensors
llama_model_loader: - type f16: 165 tensors
llama_model_loader: - type q4_K: 290 tensors
llama_model_loader: - type q6_K: 47 tensors
load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
load: special tokens cache size = 7
load: token to piece cache size = 1.9446 MB
[GIN] 2025/04/07 - 20:20:48 | 200 | 2.2098885s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/04/07 - 20:20:50 | 200 | 1.5308851s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/04/07 - 20:20:52 | 200 | 1.510633s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/04/07 - 20:20:54 | 200 | 1.6740742s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/04/07 - 20:20:56 | 200 | 2.0510651s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/04/07 - 20:29:17 | 200 | 0s | 127.0.0.1 | HEAD "/"
[GIN] 2025/04/07 - 20:29:17 | 200 | 7.501ms | 127.0.0.1 | GET "/api/tags"
[GIN] 2025/04/07 - 20:29:54 | 200 | 0s | 127.0.0.1 | HEAD "/"
[GIN] 2025/04/07 - 20:29:54 | 200 | 26.1296ms | 127.0.0.1 | POST "/api/generate"
[GIN] 2025/04/07 - 20:29:54 | 200 | 4.4996ms | 127.0.0.1 | DELETE "/api/delete"
[GIN] 2025/04/07 - 20:30:39 | 200 | 0s | 127.0.0.1 | HEAD "/"
[GIN] 2025/04/07 - 20:30:39 | 404 | 1.0003ms | 127.0.0.1 | POST "/api/show"
[GIN] 2025/04/07 - 20:30:40 | 200 | 1.0144325s | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/04/07 - 20:30:41 | 200 | 72.9726ms | 127.0.0.1 | POST "/api/show"
time=2025-04-07T20:30:41.098-06:00 level=INFO source=sched.go:716 msg="new model will fit in available VRAM in single GPU, loading" model=C:\Users\jamie\.ollama\models\blobs\sha256-e8ad13eff07a78d89926e9e8b882317d082ef5bf9768ad7b50fcdbbcd63748de gpu=GPU-463e373a-2b0b-5c5a-a5e7-bf8128659ba1 parallel=4 available=15982063616 required="11.6 GiB"
time=2025-04-07T20:30:41.116-06:00 level=INFO source=server.go:105 msg="system memory" total="63.8 GiB" free="56.4 GiB" free_swap="55.6 GiB"
time=2025-04-07T20:30:41.116-06:00 level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=49 layers.offload=49 layers.split="" memory.available="[14.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="11.6 GiB" memory.required.partial="11.6 GiB" memory.required.kv="1.9 GiB" memory.required.allocations="[11.6 GiB]" memory.weights.total="6.8 GiB" memory.weights.repeating="6.0 GiB" memory.weights.nonrepeating="787.5 MiB" memory.graph.full="519.5 MiB" memory.graph.partial="1.3 GiB" projector.weights="795.9 MiB" projector.graph="1.0 GiB"
time=2025-04-07T20:30:41.176-06:00 level=WARN source=ggml.go:152 msg="key not found" key=tokenizer.ggml.add_eot_token default=false
time=2025-04-07T20:30:41.181-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.attention.layer_norm_rms_epsilon default=9.999999974752427e-07
time=2025-04-07T20:30:41.181-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.local.freq_base default=10000
time=2025-04-07T20:30:41.181-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.global.freq_base default=1e+06
time=2025-04-07T20:30:41.181-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.freq_scale default=1
time=2025-04-07T20:30:41.181-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.mm_tokens_per_image default=256
time=2025-04-07T20:30:41.182-06:00 level=INFO source=server.go:405 msg="starting llama server" cmd="C:\\Users\\jamie\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --ollama-engine --model C:\\Users\\jamie\\.ollama\\models\\blobs\\sha256-e8ad13eff07a78d89926e9e8b882317d082ef5bf9768ad7b50fcdbbcd63748de --ctx-size 8192 --batch-size 512 --n-gpu-layers 49 --threads 8 --no-mmap --parallel 4 --port 65180"
time=2025-04-07T20:30:41.184-06:00 level=INFO source=sched.go:451 msg="loaded runners" count=1
time=2025-04-07T20:30:41.184-06:00 level=INFO source=server.go:580 msg="waiting for llama runner to start responding"
time=2025-04-07T20:30:41.184-06:00 level=INFO source=server.go:614 msg="waiting for server to become available" status="llm server error"
time=2025-04-07T20:30:41.194-06:00 level=INFO source=runner.go:816 msg="starting ollama engine"
time=2025-04-07T20:30:41.195-06:00 level=INFO source=runner.go:879 msg="Server listening on 127.0.0.1:65180"
time=2025-04-07T20:30:41.247-06:00 level=WARN source=ggml.go:152 msg="key not found" key=general.name default=""
time=2025-04-07T20:30:41.247-06:00 level=WARN source=ggml.go:152 msg="key not found" key=general.description default=""
time=2025-04-07T20:30:41.247-06:00 level=INFO source=ggml.go:67 msg="" architecture=gemma3 file_type=Q4_K_M name="" description="" num_tensors=1065 num_key_values=37
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4070 Ti SUPER, compute capability 8.9, VMM: yes
load_backend: loaded CUDA backend from C:\Users\jamie\AppData\Local\Programs\Ollama\lib\ollama\cuda_v12\ggml-cuda.dll
load_backend: loaded CPU backend from C:\Users\jamie\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-alderlake.dll
time=2025-04-07T20:30:41.306-06:00 level=INFO source=ggml.go:109 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX_VNNI=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
time=2025-04-07T20:30:41.374-06:00 level=INFO source=ggml.go:289 msg="model weights" buffer=CUDA0 size="7.6 GiB"
time=2025-04-07T20:30:41.374-06:00 level=INFO source=ggml.go:289 msg="model weights" buffer=CPU size="787.5 MiB"
time=2025-04-07T20:30:41.435-06:00 level=INFO source=server.go:614 msg="waiting for server to become available" status="llm server loading model"
time=2025-04-07T20:30:42.552-06:00 level=INFO source=ggml.go:388 msg="compute graph" backend=CUDA0 buffer_type=CUDA0
time=2025-04-07T20:30:42.552-06:00 level=INFO source=ggml.go:388 msg="compute graph" backend=CPU buffer_type=CUDA_Host
time=2025-04-07T20:30:42.553-06:00 level=WARN source=ggml.go:152 msg="key not found" key=tokenizer.ggml.add_eot_token default=false
time=2025-04-07T20:30:42.557-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.attention.layer_norm_rms_epsilon default=9.999999974752427e-07
time=2025-04-07T20:30:42.557-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.local.freq_base default=10000
time=2025-04-07T20:30:42.557-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.global.freq_base default=1e+06
time=2025-04-07T20:30:42.557-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.rope.freq_scale default=1
time=2025-04-07T20:30:42.557-06:00 level=WARN source=ggml.go:152 msg="key not found" key=gemma3.mm_tokens_per_image default=256
time=2025-04-07T20:30:42.687-06:00 level=INFO source=server.go:619 msg="llama runner started in 1.50 seconds"
[GIN] 2025/04/07 - 20:30:42 | 200 | 1.6727767s | 127.0.0.1 | POST "/api/generate"
[GIN] 2025/04/07 - 20:32:05 | 200 | 4.7126094s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/04/07 - 20:33:08 | 200 | 1.7394874s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/04/07 - 20:33:54 | 200 | 2.7065745s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/04/07 - 20:35:10 | 200 | 1.9086497s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/04/07 - 20:35:54 | 200 | 1.5301941s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/04/07 - 20:36:54 | 200 | 11.0824127s | 127.0.0.1 | POST "/api/chat"