Add files using upload-large-folder tool

- config.json +62 -43
- generation_config.json +1 -0
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +20 -4
config.json
CHANGED
@@ -1,44 +1,63 @@
 {
+  "all2all_overlap": true,
+  "architectures": [
+    "BailingMoeForCausalLM"
+  ],
+  "atorch_backend": "MegaBlocks",
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bailing_moe.BailingMoeConfig",
+    "AutoModel": "modeling_bailing_moe.BailingMoeModel",
+    "AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM"
+  },
+  "bos_token_id": 126080,
+  "dispatcher_type": "AllToAll",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 126081,
+  "expert_model_parallelism": false,
+  "first_k_dense_replace": 0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.006,
+  "intermediate_size": 1408,
+  "is_scale_gradient": true,
+  "last_logits_l2_alpha": -1.0,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "merge_w1_v1": false,
+  "model_type": "bailing_moe",
+  "moe_impl": "raw",
+  "moe_intermediate_size": 1408,
+  "moe_mlp_prefix": false,
+  "norm_head": false,
+  "norm_softmax": false,
+  "norm_topk_prob": true,
+  "num_attention_heads": 16,
+  "num_experts": 64,
+  "num_experts_per_tok": 6,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "num_shared_experts": 2,
+  "output_dropout": 0.0,
+  "output_router_logits": false,
+  "pad_token_id": 126081,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 600000,
+  "router_balance_loss_alpha": 0,
+  "router_group_balance_loss_alpha": 0.0,
+  "router_z_loss_alpha": 0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.1",
+  "transpose_w1": true,
+  "use_bias": false,
+  "use_cache": true,
+  "use_qkv_bias": false,
+  "use_sliding_window": false,
+  "use_swiglu": false,
+  "vocab_size": 126464
+}
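The new config registers custom BailingMoe classes through "auto_map", so the checkpoint is meant to be loaded with trust_remote_code. A minimal loading sketch in Python; the repository id below is a placeholder, not named in this commit:

# Minimal sketch, assuming a placeholder repo id and the transformers library.
from transformers import AutoConfig, AutoModelForCausalLM

repo = "your-org/bailing-moe-checkpoint"  # hypothetical repo id

# auto_map routes AutoConfig / AutoModelForCausalLM to the custom
# configuration_bailing_moe / modeling_bailing_moe modules shipped with the
# repo, hence trust_remote_code=True.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)

# MoE shape recorded by this commit's config.json:
assert config.num_experts == 64          # routed experts per layer
assert config.num_experts_per_tok == 6   # experts activated per token
assert config.num_shared_experts == 2    # always-active shared experts

model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype="bfloat16",  # matches "torch_dtype" in config.json
    trust_remote_code=True,
)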
generation_config.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "_from_model_config": true,
+  "bos_token_id": 126080,
   "eos_token_id": 126081,
   "pad_token_id": 126081,
   "transformers_version": "4.51.1"
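A quick check, under the same placeholder repo id assumption, that the added bos_token_id is picked up alongside the existing EOS/PAD ids:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("your-org/bailing-moe-checkpoint")  # hypothetical repo id
assert gen_cfg.bos_token_id == 126080  # added in this commit
assert gen_cfg.eos_token_id == 126081
assert gen_cfg.pad_token_id == 126081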
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1cab0b82ec17af068c667447d6b1f25ead0d84a641aaa8b867c3a96a70868110
 size 10000012352

model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:919e7e62a34e498ffbad9ba1d636f445682d041cff7466a167a7f990517c8694
 size 9997403496

model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b5e803b5a48a21a25bf65b2972b0eedf77a091e3ad59825bb3f9b910de40bf73
 size 9995576736

model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ec9b071c31c1ee2ee113311bdb1ec917b895ea09089adef415abc39d015ff044
 size 3611653272
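The four shard diffs above only fill in the Git LFS sha256 pointers. A small sketch for verifying locally downloaded shards against those hashes; the paths assume the files sit in the current directory:

import hashlib

# Expected digests taken from the LFS pointer files in this commit.
EXPECTED = {
    "model-00001-of-00004.safetensors": "1cab0b82ec17af068c667447d6b1f25ead0d84a641aaa8b867c3a96a70868110",
    "model-00002-of-00004.safetensors": "919e7e62a34e498ffbad9ba1d636f445682d041cff7466a167a7f990517c8694",
    "model-00003-of-00004.safetensors": "b5e803b5a48a21a25bf65b2972b0eedf77a091e3ad59825bb3f9b910de40bf73",
    "model-00004-of-00004.safetensors": "ec9b071c31c1ee2ee113311bdb1ec917b895ea09089adef415abc39d015ff044",
}

for name, expected in EXPECTED.items():
    digest = hashlib.sha256()
    with open(name, "rb") as f:
        # Shards are up to ~10 GB, so hash them in 1 MiB chunks.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    print(name, "OK" if digest.hexdigest() == expected else "MISMATCH")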
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -2129,6 +2129,22 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "126346": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "126347": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
     }
   },
   "additional_special_tokens": [
@@ -2140,16 +2156,16 @@
     "<|number_end|>"
   ],
   "bos_token": "<|startoftext|>",
-  "chat_template": "{%
+  "chat_template": "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role>' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
   "fast_tokenizer": true,
   "gmask_token": "[gMASK]",
   "merges_file": null,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|endoftext|>",
-  "tokenizer_class": "
-  "trust_remote_code": true
-  "vocab_file": null
+  "tokenizer_class": "PreTrainedTokenizer",
+  "trust_remote_code": true
 }
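The added chat_template wraps each message in <role>...</role> tags (renaming the user role to HUMAN), and ids 126346/126347 register <think> / </think> as added, non-special tokens. A minimal usage sketch with the same placeholder repo id; the rendered prompt follows directly from the Jinja template above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/bailing-moe-checkpoint", trust_remote_code=True)  # hypothetical repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Expected rendering per the template:
# <role>SYSTEM</role>You are a helpful assistant.<role>HUMAN</role>Hello!<role>ASSISTANT</role>
print(prompt)

# The new reasoning markers resolve to the ids added in added_tokens_decoder.
assert tok.convert_tokens_to_ids("<think>") == 126346
assert tok.convert_tokens_to_ids("</think>") == 126347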