Merged
35 commits
09f716d
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
ac908af
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
337d979
Update usage
EAddario Mar 13, 2025
6f8d16d
Add new parameters defaults
EAddario Mar 13, 2025
71c9f93
Add new quantization parameters logic
EAddario Mar 13, 2025
8e18131
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
a77d947
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
2414eaa
Update usage
EAddario Mar 13, 2025
0dd66b8
Add new parameters defaults
EAddario Mar 13, 2025
1d841c6
Add new quantization parameters logic
EAddario Mar 13, 2025
120f71b
Merge main changes into branch
EAddario Mar 14, 2025
dbcc0b5
Merge branch 'master' into quantize
EAddario Mar 14, 2025
d86de03
Minor refactoring as per the contributors' coding guidelines
EAddario Mar 14, 2025
99bae5e
Update descriptions to match existing style
EAddario Mar 14, 2025
60b0a53
Merge branch 'master' into quantize
EAddario Mar 14, 2025
3e2063d
Merge branch 'master' into quantize
EAddario Mar 16, 2025
b99fa62
Merge branch 'master' into quantize
EAddario Mar 19, 2025
f97b693
Add llama_model_quantize_params parameters
EAddario Mar 19, 2025
f11e3da
Add new quantize parameters parsing and validation
EAddario Mar 19, 2025
ad1e352
Update usage
EAddario Mar 19, 2025
4e5c96a
Add new parameters defaults
EAddario Mar 19, 2025
9b3ccb5
Add new quantization parameters logic
EAddario Mar 19, 2025
35f45f1
Minor refactoring as per the contributors' guidelines
EAddario Mar 19, 2025
071e9ef
Merge branch 'master' into quantize
EAddario Mar 22, 2025
54e13cf
Implement general --tensor-type instead of tensor-specific command op…
EAddario Mar 29, 2025
31d642c
Merge branch 'master' into quantize
EAddario Mar 29, 2025
b3c7db5
Fix implied type bug
EAddario Mar 30, 2025
625f0ae
Restore missing #includes
EAddario Mar 31, 2025
2fd0b41
Add regex capability for tensor selection
EAddario Apr 1, 2025
3e9f565
Merge branch 'master' into quantize
EAddario Apr 2, 2025
054ede4
Refactor function name and update ALLOWED_TENSOR_TYPE
EAddario Apr 3, 2025
5a304b8
Add missing #include
EAddario Apr 3, 2025
1acb9f4
Handle edge case when tensor name is cls.output
EAddario Apr 3, 2025
04604a4
Minor logging improvement
EAddario Apr 7, 2025
30443a5
Merge branch 'master' into quantize
EAddario Apr 7, 2025
Add llama_model_quantize_params parameters
EAddario committed Mar 14, 2025
commit 8e18131b53b2cf273d4bdf7ae87af9166d33e895
include/llama.h: 30 changes (19 additions, 11 deletions)
@@ -355,17 +355,25 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;              // quantize to this llama_ftype
-        enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // token embeddings tensor type
-        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                           // quantize all tensors to the default type
-        bool keep_split;                     // quantize to the same number of shards
-        void * imatrix;                      // pointer to importance matrix data
-        void * kv_overrides;                 // pointer to vector containing overrides
+        int32_t nthread;                        // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;                 // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;      // output tensor type
+        enum ggml_type token_embedding_type;    // token embeddings tensor type
+        bool allow_requantize;                  // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;            // quantize output.weight
+        bool only_copy;                         // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                              // quantize all tensors to the default type
+        bool keep_split;                        // quantize to the same number of shards
+        void * imatrix;                         // pointer to importance matrix data
+        void * kv_overrides;                    // pointer to vector containing overrides
+        enum ggml_type attn_q_tensor_type;      // attention query tensor type
+        enum ggml_type attn_k_tensor_type;      // attention key tensor type
+        enum ggml_type attn_v_tensor_type;      // attention value tensor type
+        enum ggml_type attn_qkv_tensor_type;    // attention query, key and value tensor type
+        enum ggml_type attn_output_tensor_type; // attention output tensor type
+        enum ggml_type ffn_up_tensor_type;      // feedforward up tensor type
+        enum ggml_type ffn_gate_tensor_type;    // feedforward gate tensor type
+        enum ggml_type ffn_down_tensor_type;    // feedforward down tensor type
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
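For reference, a minimal sketch of how a caller might populate the new fields once this commit is in place. The field names come from the diff above, and llama_model_quantize_default_params() / llama_model_quantize() are the existing llama.h entry points; the specific ftype, tensor-type choices, and file names are illustrative assumptions, not part of this commit.

#include "llama.h"

int main(void) {
    // Start from the library defaults so all other fields keep sane values.
    llama_model_quantize_params params = llama_model_quantize_default_params();

    // Base type for tensors without a specific override (illustrative choice).
    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;

    // Per-tensor-type overrides added by this commit: keep the attention V and
    // feed-forward down projections at a higher-precision quantization than the
    // base ftype (illustrative values).
    params.attn_v_tensor_type   = GGML_TYPE_Q6_K;
    params.ffn_down_tensor_type = GGML_TYPE_Q6_K;

    // File names are placeholders; llama_model_quantize returns 0 on success.
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}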