LiangJiang committed
Commit 2bb8988 · verified · Parent(s): 1265f1f

Add files using upload-large-folder tool

config.json CHANGED
@@ -1,44 +1,63 @@
 {
-  "architectures": [
-    "BailingMoeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_bailing_moe.BailingMoeConfig",
-    "AutoModel": "modeling_bailing_moe.BailingMoeModel",
-    "AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM"
-  },
-  "eos_token_id": 126081,
-  "pad_token_id": 126081,
-  "first_k_dense_replace": 0,
-  "hidden_act": "silu",
-  "hidden_size": 2048,
-  "initializer_range": 0.006,
-  "intermediate_size": 1408,
-  "max_position_embeddings": 32768,
-  "model_type": "bailing_moe",
-  "moe_intermediate_size": 1408,
-  "num_experts": 64,
-  "num_shared_experts": 2,
-  "norm_topk_prob": true,
-  "num_attention_heads": 16,
-  "num_experts_per_tok": 6,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 600000,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.0",
-  "use_cache": true,
-  "use_bias": false,
-  "use_qkv_bias": false,
-  "vocab_size": 126464,
-  "output_router_logits": false,
-  "embedding_dropout": 0.0,
-  "norm_head": false,
-  "norm_softmax": false,
-  "output_dropout": 0.0
-}
+  "all2all_overlap": true,
+  "architectures": [
+    "BailingMoeForCausalLM"
+  ],
+  "atorch_backend": "MegaBlocks",
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bailing_moe.BailingMoeConfig",
+    "AutoModel": "modeling_bailing_moe.BailingMoeModel",
+    "AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM"
+  },
+  "bos_token_id": 126080,
+  "dispatcher_type": "AllToAll",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 126081,
+  "expert_model_parallelism": false,
+  "first_k_dense_replace": 0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.006,
+  "intermediate_size": 1408,
+  "is_scale_gradient": true,
+  "last_logits_l2_alpha": -1.0,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "merge_w1_v1": false,
+  "model_type": "bailing_moe",
+  "moe_impl": "raw",
+  "moe_intermediate_size": 1408,
+  "moe_mlp_prefix": false,
+  "norm_head": false,
+  "norm_softmax": false,
+  "norm_topk_prob": true,
+  "num_attention_heads": 16,
+  "num_experts": 64,
+  "num_experts_per_tok": 6,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "num_shared_experts": 2,
+  "output_dropout": 0.0,
+  "output_router_logits": false,
+  "pad_token_id": 126081,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 600000,
+  "router_balance_loss_alpha": 0,
+  "router_group_balance_loss_alpha": 0.0,
+  "router_z_loss_alpha": 0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.1",
+  "transpose_w1": true,
+  "use_bias": false,
+  "use_cache": true,
+  "use_qkv_bias": false,
+  "use_sliding_window": false,
+  "use_swiglu": false,
+  "vocab_size": 126464
+}
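
Beyond re-sorting the keys, the new config adds bos_token_id (126080), a per-head head_dim, and a batch of MoE training/runtime fields (atorch_backend, dispatcher_type, the router_*_loss_alpha values, sliding-window settings, and so on). A minimal sketch of loading it, assuming a hypothetical Hub repo id and the remote code shipped alongside in configuration_bailing_moe.py:

```python
# A minimal loading sketch; the repo id below is a placeholder, not taken from the diff.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "your-org/your-bailing-moe-checkpoint",  # hypothetical repo id
    trust_remote_code=True,  # config class lives in configuration_bailing_moe.py
)
print(cfg.model_type)       # "bailing_moe"
print(cfg.bos_token_id)     # 126080, newly added in this commit
print(cfg.num_experts, cfg.num_experts_per_tok)  # 64 experts, top-6 routing
```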
generation_config.json CHANGED
@@ -1,5 +1,6 @@
 {
   "_from_model_config": true,
+  "bos_token_id": 126080,
   "eos_token_id": 126081,
   "pad_token_id": 126081,
   "transformers_version": "4.51.1"
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5092192b96ac711864b21fd6e772e28b7450db1cf921e723b5a74202df1e0d59
+oid sha256:1cab0b82ec17af068c667447d6b1f25ead0d84a641aaa8b867c3a96a70868110
 size 10000012352
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:533debde11b9f418a9525207ad3b1e7f31de1b21177213b58e3376935b8aab48
+oid sha256:919e7e62a34e498ffbad9ba1d636f445682d041cff7466a167a7f990517c8694
 size 9997403496
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2af3073fa0b56c8ae033d4d0e4cb43b5ba83f1cc6f189674d14005b14b96385a
+oid sha256:b5e803b5a48a21a25bf65b2972b0eedf77a091e3ad59825bb3f9b910de40bf73
 size 9995576736
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:caba3e3a08933b84ef7d1be48fea5308b1bd75dcb90e3d00b5a37e60f014110c
+oid sha256:ec9b071c31c1ee2ee113311bdb1ec917b895ea09089adef415abc39d015ff044
 size 3611653272
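
All four shards keep their byte sizes but get new content hashes, so the weights themselves changed. A local download can be checked against the oid in its LFS pointer, as in this sketch:

```python
# Sketch: verify a downloaded shard against the sha256 oid from its Git LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

expected = "1cab0b82ec17af068c667447d6b1f25ead0d84a641aaa8b867c3a96a70868110"
assert sha256_of("model-00001-of-00004.safetensors") == expected
```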
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -2129,6 +2129,22 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    }
+    },
+    "126346": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "126347": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
   },
   "additional_special_tokens": [
@@ -2140,16 +2156,16 @@
     "<|number_end|>"
   ],
   "bos_token": "<|startoftext|>",
-  "chat_template": "{% set system_present = false %}{% set thinking_option = 'on' %}{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'system' %}{% set system_present = true %}{% endif %}{% endfor %}{% if not system_present %}{{ '<role>SYSTEM</role>detailed thinking ' + thinking_option }}{% endif %}{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'] + ('\ndetailed thinking ' + thinking_option if role == 'SYSTEM' else '') }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role><think>\n' }}{% endif %}",
+  "chat_template": "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role>' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
   "fast_tokenizer": true,
   "gmask_token": "[gMASK]",
   "merges_file": null,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|endoftext|>",
-  "tokenizer_class": "PreTrainedTokenizerFast",
-  "trust_remote_code": true,
-  "vocab_file": null
+  "tokenizer_class": "PreTrainedTokenizer",
+  "trust_remote_code": true
 }
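
The tokenizer changes are the most substantive part of the commit: <think> and </think> are registered as ordinary (non-special) added tokens at ids 126346/126347, the tokenizer class switches from PreTrainedTokenizerFast to PreTrainedTokenizer, and the chat template drops both the injected "detailed thinking" system line and the "<think>\n" generation prefix. A sketch of what the new template renders, again under a placeholder repo id:

```python
# Sketch: rendering the simplified chat template added in this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "your-org/your-bailing-moe-checkpoint",  # hypothetical repo id
    trust_remote_code=True,
)
text = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
# The template maps user -> HUMAN, uppercases roles, and wraps them in <role> tags:
print(text)  # "<role>HUMAN</role>Hello<role>ASSISTANT</role>"
```

Because <think>/</think> now carry "special": false, they survive decoding with skip_special_tokens=True, which fits a model expected to emit its reasoning inline rather than having the template force a thinking prefix.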