[GIN] 2025/03/09 - 08:04:36 | 200 | 0s | 192.168.11.100 | HEAD "/"
[GIN] 2025/03/09 - 08:04:36 | 200 | 12.6348ms | 192.168.11.100 | POST "/api/show"
time=2025-03-09T08:04:36.409+08:00 level=WARN source=ggml.go:132 msg="key not found" key=qwen2.attention.key_length default=128
time=2025-03-09T08:04:36.409+08:00 level=WARN source=ggml.go:132 msg="key not found" key=qwen2.attention.value_length default=128
time=2025-03-09T08:04:36.409+08:00 level=INFO source=sched.go:715 msg="new model will fit in available VRAM in single GPU, loading" model=D:\Software\AI\ollama\model\blobs\sha256-c62ccde5630c20c8a9cf601861d31977d07450cad6dfdf1c661aab307107bddb gpu=GPU-213bb6a8-2adc-9591-a814-ae650a18a4f2 parallel=4 available=23918694400 required="21.5 GiB"
time=2025-03-09T08:04:36.421+08:00 level=INFO source=server.go:97 msg="system memory" total="31.8 GiB" free="18.7 GiB" free_swap="22.0 GiB"
time=2025-03-09T08:04:36.421+08:00 level=WARN source=ggml.go:132 msg="key not found" key=qwen2.attention.key_length default=128
time=2025-03-09T08:04:36.421+08:00 level=WARN source=ggml.go:132 msg="key not found" key=qwen2.attention.value_length default=128
time=2025-03-09T08:04:36.422+08:00 level=INFO source=server.go:130 msg=offload library=cuda layers.requested=-1 layers.model=65 layers.offload=65 layers.split="" memory.available="[22.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="21.5 GiB" memory.required.partial="21.5 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[21.5 GiB]" memory.weights.total="19.5 GiB" memory.weights.repeating="18.9 GiB" memory.weights.nonrepeating="609.1 MiB" memory.graph.full="676.0 MiB" memory.graph.partial="916.1 MiB"
time=2025-03-09T08:04:36.424+08:00 level=INFO source=server.go:380 msg="starting llama server" cmd="D:\\Software\\AI\\ollama\\ollama.exe runner --model D:\\Software\\AI\\ollama\\model\\blobs\\sha256-c62ccde5630c20c8a9cf601861d31977d07450cad6dfdf1c661aab307107bddb --ctx-size 8192 --batch-size 512 --n-gpu-layers 65 --threads 8 --no-mmap --parallel 4 --port 52036"
time=2025-03-09T08:04:36.426+08:00 level=INFO source=sched.go:450 msg="loaded runners" count=1
time=2025-03-09T08:04:36.426+08:00 level=INFO source=server.go:557 msg="waiting for llama runner to start responding"
time=2025-03-09T08:04:36.427+08:00 level=INFO source=server.go:591 msg="waiting for server to become available" status="llm server error"
time=2025-03-09T08:04:36.443+08:00 level=INFO source=runner.go:932 msg="starting go runner"
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
load_backend: loaded CUDA backend from D:\Software\AI\ollama\lib\ollama\cuda_v12\ggml-cuda.dll
load_backend: loaded CPU backend from D:\Software\AI\ollama\lib\ollama\ggml-cpu-alderlake.dll
time=2025-03-09T08:04:36.596+08:00 level=INFO source=runner.go:935 msg=system info="CPU : LLAMAFILE = 1 | CPU : LLAMAFILE = 1 | CUDA : ARCHS = 600,610,620,700,720,750,800,860,870,890,900 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | cgo(clang)" threads=8
time=2025-03-09T08:04:36.596+08:00 level=INFO source=runner.go:993 msg="Server listening on 127.0.0.1:52036"
llama_load_model_from_file: using device CUDA0 (NVIDIA GeForce RTX 4090) - 23008 MiB free
time=2025-03-09T08:04:36.677+08:00 level=INFO source=server.go:591 msg="waiting for server to become available" status="llm server loading model"
llama_model_loader: loaded meta data with 33 key-value pairs and 771 tensors from D:\Software\AI\ollama\model\blobs\sha256-c62ccde5630c20c8a9cf601861d31977d07450cad6dfdf1c661aab307107bddb (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = QwQ 32B
llama_model_loader: - kv 3: general.basename str = QwQ
llama_model_loader: - kv 4: general.size_label str = 32B
llama_model_loader: - kv 5: general.license str = apache-2.0
llama_model_loader: - kv 6: general.license.link str = https://huggingface.co/Qwen/QWQ-32B/b...
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Qwen2.5 32B
llama_model_loader: - kv 9: general.base_model.0.organization str = Qwen
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-32B
llama_model_loader: - kv 11: general.tags arr[str,2] = ["chat", "text-generation"]
llama_model_loader: - kv 12: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 13: qwen2.block_count u32 = 64
llama_model_loader: - kv 14: qwen2.context_length u32 = 131072
llama_model_loader: - kv 15: qwen2.embedding_length u32 = 5120
llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 27648
llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 40
llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8
llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2
llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645
llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151643
llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 151643
llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
llama_model_loader: - kv 31: general.quantization_version u32 = 2
llama_model_loader: - kv 32: general.file_type u32 = 15
llama_model_loader: - type f32: 321 tensors
llama_model_loader: - type q4_K: 385 tensors
llama_model_loader: - type q6_K: 65 tensors
llm_load_vocab: special tokens cache size = 26
llm_load_vocab: token to piece cache size = 0.9311 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = qwen2
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 152064
llm_load_print_meta: n_merges = 151387
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 131072
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_layer = 64
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 8
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_swa = 0
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 5
llm_load_print_meta: n_embd_k_gqa = 1024
llm_load_print_meta: n_embd_v_gqa = 1024
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 27648
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 2
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 1000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn = 131072
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: ssm_dt_b_c_rms = 0
llm_load_print_meta: model type = 32B
llm_load_print_meta: model ftype = Q4_K - Medium
llm_load_print_meta: model params = 32.76 B
llm_load_print_meta: model size = 18.48 GiB (4.85 BPW)
llm_load_print_meta: general.name = QwQ 32B
llm_load_print_meta: BOS token = 151643 '<|endoftext|>'
llm_load_print_meta: EOS token = 151645 '<|im_end|>'
llm_load_print_meta: EOT token = 151645 '<|im_end|>'
llm_load_print_meta: PAD token = 151643 '<|endoftext|>'
llm_load_print_meta: LF token = 148848 'ÄĬ'
llm_load_print_meta: FIM PRE token = 151659 '<|fim_prefix|>'
llm_load_print_meta: FIM SUF token = 151661 '<|fim_suffix|>'
llm_load_print_meta: FIM MID token = 151660 '<|fim_middle|>'
llm_load_print_meta: FIM PAD token = 151662 '<|fim_pad|>'
llm_load_print_meta: FIM REP token = 151663 '<|repo_name|>'
llm_load_print_meta: FIM SEP token = 151664 '<|file_sep|>'
llm_load_print_meta: EOG token = 151643 '<|endoftext|>'
llm_load_print_meta: EOG token = 151645 '<|im_end|>'
llm_load_print_meta: EOG token = 151662 '<|fim_pad|>'
llm_load_print_meta: EOG token = 151663 '<|repo_name|>'
llm_load_print_meta: EOG token = 151664 '<|file_sep|>'
llm_load_print_meta: max token length = 256
llm_load_tensors: offloading 64 repeating layers to GPU
llm_load_tensors: offloading output layer to GPU
llm_load_tensors: offloaded 65/65 layers to GPU
llm_load_tensors: CPU model buffer size = 417.66 MiB
llm_load_tensors: CUDA0 model buffer size = 18508.35 MiB
llama_new_context_with_model: n_seq_max = 4
llama_new_context_with_model: n_ctx = 8192
llama_new_context_with_model: n_ctx_per_seq = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_new_context_with_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 8192, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 64, can_shift = 1
llama_kv_cache_init: CUDA0 KV buffer size = 2048.00 MiB
llama_new_context_with_model: KV self size = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
llama_new_context_with_model: CUDA_Host output buffer size = 2.40 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 696.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 26.01 MiB
llama_new_context_with_model: graph nodes = 2246
llama_new_context_with_model: graph splits = 2
time=2025-03-09T08:04:43.186+08:00 level=INFO source=server.go:596 msg="llama runner started in 6.76 seconds"
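For reference, the 2048.00 MiB KV-cache figure reported above follows directly from the hyperparameters in this log. Below is a minimal sketch of that arithmetic (Python; the variable names are mine and this is the standard f16 KV-cache sizing formula for grouped-query attention, not code taken from ollama or llama.cpp):

# KV-cache sizing from the values logged above (illustrative only, not ollama code).
n_layer       = 64    # llm_load_print_meta: n_layer
n_embd_k_gqa  = 1024  # llm_load_print_meta: n_embd_k_gqa (8 KV heads * 128 head dim)
n_embd_v_gqa  = 1024  # llm_load_print_meta: n_embd_v_gqa
kv_size       = 8192  # llama_kv_cache_init: kv_size (--ctx-size 8192)
bytes_per_f16 = 2     # type_k = type_v = 'f16'

k_bytes = n_layer * n_embd_k_gqa * kv_size * bytes_per_f16
v_bytes = n_layer * n_embd_v_gqa * kv_size * bytes_per_f16

print(f"K: {k_bytes / 2**20:.2f} MiB")                     # 1024.00 MiB, matching the log
print(f"V: {v_bytes / 2**20:.2f} MiB")                     # 1024.00 MiB
print(f"KV total: {(k_bytes + v_bytes) / 2**20:.2f} MiB")  # 2048.00 MiB

Note also that n_ctx_per_seq = n_ctx / n_seq_max = 8192 / 4 = 2048, which is why the loader warns that the model's 131072-token training context "will not be utilized": each of the 4 parallel slots only gets a 2048-token window unless --ctx-size is raised.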