DeepSeek code sync from the os branch

twc
2025-03-21 11:39:14 +08:00
parent 17b5b5eb51
commit 48a47f64d6
17 changed files with 481 additions and 101 deletions

View File

@@ -221,16 +221,26 @@ class MFContextOperator(MFContextConfig):
return cls._instance
def __init__(self, config):
self.config = config
supported_kwargs = self._handle_data()
logger.debug('MFContextConfig load configs: %s', supported_kwargs)
super(MFContextOperator, self).__init__(**supported_kwargs)
use_past = self.config.get_value('model.model_config.use_past', False)
if not hasattr(self, 'train_precision_sync'):
self.train_precision_sync = None
if not hasattr(self, 'infer_precision_sync'):
self.infer_precision_sync = None
self.set_env(use_past)
if not hasattr(self, '_initailed'):
self.config = config
supported_kwargs = self._handle_data()
logger.debug('MFContextConfig load configs: %s', supported_kwargs)
super(MFContextOperator, self).__init__(**supported_kwargs)
use_past = self.config.get_value('model.model_config.use_past',
False)
if not hasattr(self, 'train_precision_sync'):
self.train_precision_sync = None
if not hasattr(self, 'infer_precision_sync'):
self.infer_precision_sync = None
self.set_env(use_past)
self._initailed = True
@classmethod
def get_mf_ctx_instance(cls):
"""Return the singleton MFContextOperator instance if it exists, otherwise None."""
if cls._instance:
return cls.__new__(cls)
return None
def _handle_data(self):
ctx_config = self.config.get('context', {})
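The change above reuses the MFContextOperator singleton and guards `__init__` with an `_initailed` flag so the expensive setup only runs once even though `__init__` fires on every construction. A minimal, self-contained sketch of that idiom, with hypothetical class and method names (only the flag-guard pattern mirrors the source):

class SingletonOperator:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Always hand back the one shared instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, config):
        # __init__ runs again on each call; the guard keeps setup one-shot.
        if not hasattr(self, '_initailed'):
            self.config = config
            self._initailed = True

    @classmethod
    def get_instance(cls):
        """Return the existing instance, or None if it was never created."""
        return cls._instance

op1 = SingletonOperator({'a': 1})
op2 = SingletonOperator({'a': 2})
assert op1 is op2 and op1.config == {'a': 1}   # second config is ignored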

View File

@@ -28,6 +28,9 @@ from mindformers.experimental.infer.core.layers import ColumnParallelLinear, Row
from mindformers.experimental.infer.core.mapping import ReduceFromModelParallelRegion
# pylint: disable=C0412
from mindformers.experimental.infer.core.utils import get_tp_world_size
from mindformers.tools.utils import divide
try:
from mindspore.ops.auto_generate import (MoeComputeExpertTokens,
MoeFinalizeRouting,
@@ -230,23 +233,33 @@ class SharedParallelMLP(nn.Cell):
self.hidden_size = self.config.hidden_size
self.ffn_hidden_size = intermediate_size
self.mlp_has_gate = self.config.mlp_has_gate
self.w1 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.w3 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.ffn_concat = self.config.ffn_concat
if self.ffn_concat:
self.w_gate_hidden = Linear(
self.hidden_size,
self.ffn_hidden_size * 2,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
else:
self.w1 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.w3 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.act_type = self.config.hidden_act
self.act_func = get_act_func(self.act_type)
@@ -265,8 +278,13 @@ class SharedParallelMLP(nn.Cell):
def construct(self, x):
""" Construct function of mlp block. """
gate = self.w1(x)
hidden = self.w3(x)
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size, self.ffn_hidden_size), -1)
else:
gate = self.w1(x)
hidden = self.w3(x)
gate = self.act_func(gate)
hidden = mint.mul(hidden, gate)
output = self.w2(hidden)
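The ffn_concat path fuses the gate (w1) and hidden (w3) projections into one w_gate_hidden matmul whose output is split back in two. A NumPy stand-in showing the equivalence, assuming the fused weight stacks w1 above w3 (consistent with the split order above) and the transpose_b=True weight layout of the Linear layers:

import numpy as np

hidden_size, ffn_hidden_size = 8, 16
x = np.random.randn(4, hidden_size).astype(np.float32)
w1 = np.random.randn(ffn_hidden_size, hidden_size).astype(np.float32)  # gate projection
w3 = np.random.randn(ffn_hidden_size, hidden_size).astype(np.float32)  # hidden projection

# Separate projections (ffn_concat=False path).
gate_ref, hidden_ref = x @ w1.T, x @ w3.T

# Fused projection (ffn_concat=True path): one weight, one matmul, then a split.
w_gate_hidden = np.concatenate([w1, w3], axis=0)      # (2*ffn_hidden, hidden)
gate_hidden_out = x @ w_gate_hidden.T                 # (batch, 2*ffn_hidden)
gate, hidden = np.split(gate_hidden_out, [ffn_hidden_size], axis=-1)

assert np.allclose(gate, gate_ref, atol=1e-5)
assert np.allclose(hidden, hidden_ref, atol=1e-5)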
@@ -294,20 +312,48 @@ class RoutedParallelMLP(nn.Cell):
self.cast = P.Cast()
self.act_type = self.config.hidden_act
self.act_func = get_act_func(self.act_type)
self.ffn_concat = self.config.ffn_concat
tp_group_size = get_tp_world_size()
self.ffn_hidden_size_per_partition = divide(self.ffn_hidden_size, tp_group_size)
if self.ffn_concat:
self.w_gate_hidden = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size * 2,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
else:
self.w1 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
self.w1 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
self.w3 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
self.w2 = RowParallelLinear(
self.ffn_hidden_size,
self.hidden_size,
@@ -323,27 +369,18 @@ class RoutedParallelMLP(nn.Cell):
moe_delay_allreduce=True,
)
self.w3 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
def construct(self, x, group_list=None):
"""Forward process of the FeedForward"""
x = self.cast(x, self.config.compute_dtype)
gate = self.w1(x, group_list=group_list)
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x, group_list=group_list)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size_per_partition, self.ffn_hidden_size_per_partition), -1)
else:
gate = self.w1(x, group_list=group_list)
hidden = self.w3(x, group_list=group_list)
gate = self.act_func(gate)
hidden = self.w3(x, group_list=group_list)
hidden = mint.mul(hidden, gate)
output = self.w2(hidden, group_list=group_list)
output = self.w2(hidden, group_list)
return output
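In the routed (expert) MLP the fused projection is a ColumnParallelLinear, so each tensor-parallel rank only holds ffn_hidden_size * 2 / tp columns; that is why the split above uses ffn_hidden_size_per_partition rather than the full ffn_hidden_size. A small sketch of the bookkeeping (the local divide below is a stand-in for the mindformers.tools.utils.divide helper imported above):

def divide(numerator, denominator):
    # Stand-in for mindformers.tools.utils.divide: exact integer division.
    assert numerator % denominator == 0, "ffn_hidden_size must be divisible by tp size"
    return numerator // denominator

ffn_hidden_size, tp_group_size = 2048, 16
ffn_hidden_size_per_partition = divide(ffn_hidden_size, tp_group_size)   # 128

# Each rank's w_gate_hidden output has 2 * 128 = 256 columns, split into a
# 128-wide gate slice and a 128-wide hidden slice.
per_rank_fused_width = 2 * ffn_hidden_size_per_partition
split_sizes = (ffn_hidden_size_per_partition, ffn_hidden_size_per_partition)
print(per_rank_fused_width, split_sizes)   # 256 (128, 128)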

View File

@@ -18,10 +18,10 @@ import math
import mindspore.common.dtype as mstype
from mindspore import ops as P
from mindspore import nn
from mindspore import nn, Parameter
from mindspore.common.initializer import initializer
from mindformers.experimental.infer.core.utils import create_empty_parameter
class ParallelPagedAttentionMgr(nn.Cell):
"""Paged Attention Manager."""
def __init__(self,
@@ -59,6 +59,12 @@ class ParallelPagedAttentionMgr(nn.Cell):
name="value_cache",
requires_grad=False,
)
elif self.npu_mem_size == 0:
self.key_cache = Parameter(initializer('zeros', kv_shape, compute_dtype), name="key_cache",
requires_grad=False)
self.value_cache = Parameter(initializer('zeros', kv_shape, compute_dtype), name="value_cache",
requires_grad=False)
self.reshape_and_cache = P.auto_generate.ReshapeAndCache()
self.paged_attention = P.auto_generate.PagedAttention(self.n_heads,
self.scale_value,

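When npu_mem_size == 0, the manager above still registers zero-filled key/value caches as non-trainable Parameters. A minimal standalone sketch of that allocation; the kv_shape layout below is an assumption for illustration, not taken from this file:

import mindspore.common.dtype as mstype
from mindspore import Parameter
from mindspore.common.initializer import initializer

# Assumed cache layout: (num_blocks, block_size, n_kv_heads, head_dim).
num_blocks, block_size, n_kv_heads, head_dim = 512, 16, 2, 128
kv_shape = (num_blocks, block_size, n_kv_heads, head_dim)
compute_dtype = mstype.bfloat16

# Non-trainable, zero-initialized caches, matching the elif branch above.
key_cache = Parameter(initializer('zeros', kv_shape, compute_dtype),
                      name="key_cache", requires_grad=False)
value_cache = Parameter(initializer('zeros', kv_shape, compute_dtype),
                        name="value_cache", requires_grad=False)
print(key_cache.shape, value_cache.shape)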
View File

@@ -112,6 +112,7 @@ class LlamaConfig(PretrainedConfig):
calculate_per_token_loss (bool, optional): Whether to calculate the loss of each token. Default: ``False``.
pipeline_stage (dict, optional): A dict set the start_stage, stage_num, and offset of the model when
pipeline parallelism. Default: ``None``.
return_hidden_states (bool, optional): Whether to return hidden states. Default: ``False``.
Returns:
LlamaConfig, a LlamaConfig instance.
@@ -191,6 +192,7 @@ class LlamaConfig(PretrainedConfig):
chunk_prefill: bool = False,
calculate_per_token_loss: bool = False,
pipeline_stage: dict = None,
return_hidden_states: bool = False,
**kwargs):
"""
Note:
@@ -267,6 +269,7 @@ class LlamaConfig(PretrainedConfig):
self.input_sliced_sig = input_sliced_sig
self.rmsnorm_compute_2d = rmsnorm_compute_2d
self.chunk_prefill = chunk_prefill
self.return_hidden_states = return_hidden_states
self.calculate_per_token_loss = calculate_per_token_loss
if (pipeline_stage is not None and
pipeline_stage["start_stage"] + pipeline_stage["stage_num"] <= parallel_config.pipeline_stage):

View File

@@ -28,16 +28,16 @@ from mindformers.version_control import synchronize
def get_profile_settings():
"""Get profile settings for context."""
context_module = import_module("mindformers.core.context.build_context")
context_instance = context_module.Context()
mf_ctx_instance = context_module.MFContextOperator.get_mf_ctx_instance()
profile = False
profile_start_step = 0
profile_stop_step = 0
profile_level = 1
if context_instance is not None and context_instance.is_exists():
profile = context_module.get_context("profile")
profile_start_step = context_module.get_context("profile_start_step")
profile_stop_step = context_module.get_context("profile_stop_step")
profile_level = context_module.get_context("profile_level") or profile_level
if mf_ctx_instance is not None:
profile = getattr(mf_ctx_instance, 'profile', profile)
profile_start_step = getattr(mf_ctx_instance, 'profile_start_step', profile_start_step)
profile_stop_step = getattr(mf_ctx_instance, 'profile_stop_step', profile_stop_step)
profile_level = getattr(mf_ctx_instance, 'profile_level', profile_level)
max_level = len(ProfilerLevel.__members__) - 1
if profile_level < 0 or profile_level > max_level:
logger.warning("Invalid profile level: %s, return level1.", profile_level)
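The rewritten get_profile_settings reads each setting straight off the MFContextOperator singleton with getattr fallbacks instead of going through get_context, so the defaults survive when the instance is missing or an attribute was never set. A small self-contained sketch of that fallback pattern (DummyCtx is hypothetical):

class DummyCtx:
    profile = True
    profile_start_step = 2
    # profile_stop_step and profile_level intentionally left unset.

def read_profile_settings(mf_ctx_instance):
    profile, profile_start_step, profile_stop_step, profile_level = False, 0, 0, 1
    if mf_ctx_instance is not None:
        profile = getattr(mf_ctx_instance, 'profile', profile)
        profile_start_step = getattr(mf_ctx_instance, 'profile_start_step', profile_start_step)
        profile_stop_step = getattr(mf_ctx_instance, 'profile_stop_step', profile_stop_step)
        profile_level = getattr(mf_ctx_instance, 'profile_level', profile_level)
    return profile, profile_start_step, profile_stop_step, profile_level

print(read_profile_settings(None))        # (False, 0, 0, 1)
print(read_profile_settings(DummyCtx()))  # (True, 2, 0, 1)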

View File

@@ -51,7 +51,7 @@ from mindformers.modules.transformer.moe import MoEV2
from mindformers.modules.transformer.moe import MoEInfer
from mindformers.version_control import check_seqpp_fa_opt_support
from deepseek2_config import DeepseekV2Config
from research.deepseek3.deepseek2_config import DeepseekV2Config
__all__ = ['DeepseekV2ForCausalLM', 'DeepseekV2Model']

View File

@@ -17,8 +17,8 @@ import os
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from deepseek3_model_train import TrainingDeepseekV3ForCausalLM
from deepseek3_model_infer import InferenceDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_model_train import TrainingDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM
__all__ = ['DeepseekV3ForCausalLM']

View File

@@ -10,7 +10,7 @@ auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distr
# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepseekV3'
model_name: 'DeepSeekV3'
# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
@@ -22,8 +22,13 @@ parallel_config:
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "58GB"
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None
kernel_launch_group:
thread_num: 4
kernel_group_num: 16
# parallel context config
parallel:
@@ -83,6 +88,7 @@ model:
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM

View File

@@ -4,17 +4,17 @@ run_mode: 'predict'
use_parallel: True
load_checkpoint: "/path/to/deepseekv3/model_w8a8_ckpt"
load_ckpt_format: "ckpt"
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model
# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepseekV3-W8A8'
model_name: 'DeepSeekV3-W8A8'
# default parallel of device num = 32 for Atlas 800T A2
# default parallel of device num = 16 for Atlas 800T A2
parallel_config:
model_parallel: 32
model_parallel: 16
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False
@@ -22,8 +22,13 @@ parallel_config:
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "58GB"
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None
kernel_launch_group:
thread_num: 4
kernel_group_num: 16
# parallel context config
parallel:
@@ -83,6 +88,7 @@ model:
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
quantization_config:
quant_method: 'ptq'
weight_dtype: 'int8'

View File

@@ -23,7 +23,7 @@ from mindformers.modules.transformer.transformer import default_transformer_conf
from mindformers.models.utils import convert_mstype
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from mindformers.mindformer_book import MindFormerBook
from deepseek2_config import DeepseekV2Config
from research.deepseek3.deepseek2_config import DeepseekV2Config
__all__ = ['DeepseekV3Config']
@@ -135,6 +135,7 @@ class DeepseekV3Config(DeepseekV2Config):
init_method_std=0.006,
qkv_has_bias=False,
qkv_concat=False,
ffn_concat=False,
parallel_config: Union[dict, TransformerOpParallelConfig] = default_transformer_config,
moe_config: Union[dict, MoEConfig] = default_moe_config,
use_past: bool = False,
@@ -160,6 +161,7 @@ class DeepseekV3Config(DeepseekV2Config):
use_fused_rope=False,
use_fused_swiglu=False,
enable_fa_var_len=False,
return_hidden_states=False,
**kwargs):
super(DeepseekV3Config, self).__init__(**kwargs)
if isinstance(parallel_config, dict):
@@ -187,6 +189,7 @@ class DeepseekV3Config(DeepseekV2Config):
self.ffn_dim_multiplier = ffn_dim_multiplier
self.rms_norm_eps = rms_norm_eps
self.qkv_concat = qkv_concat
self.ffn_concat = ffn_concat
self.param_init_type = convert_mstype(param_init_type)
self.qkv_has_bias = qkv_has_bias
self.layernorm_compute_type = convert_mstype(layernorm_compute_type)
@@ -226,3 +229,4 @@ class DeepseekV3Config(DeepseekV2Config):
self.use_fused_rope = use_fused_rope
self.use_fused_swiglu = use_fused_swiglu
self.enable_fa_var_len = enable_fa_var_len
self.return_hidden_states = return_hidden_states

View File

@@ -24,6 +24,7 @@ from mindspore.ops import operations as P
from mindspore.nn.cell import Cell
from mindspore.common.initializer import Zero
from mindspore.communication._comm_helper import _is_initialized
try:
from mindspore._checkparam import Validator
except ImportError:
@@ -45,8 +46,8 @@ from mindformers.experimental.infer.core.norm import RMSNorm
from mindformers.experimental.infer.core.moe import RoutedParallelMLP, SharedParallelMLP, ParallelMoEV2
from mindformers.experimental.infer.core.transformer import ParallelMLP, VocabEmbedding
from deepseek3_config import DeepseekV3Config
from utils import convert_model_config
from research.deepseek3.deepseek3_config import DeepseekV3Config
from research.deepseek3.utils import convert_model_config
__all__ = ['InferenceDeepseekV3ForCausalLM', 'DeepseekV3Model']
@@ -420,7 +421,6 @@ class DeepseekV3Attention(nn.Cell):
.format(self.n_kv_head, parallel_config.model_parallel))
self.shape = P.Shape()
self.cast = P.Cast()
if self.q_lora_rank == 0:
self.q_proj = ColumnParallelLinear(
self.hidden_size,
@@ -609,6 +609,46 @@ class DeepseekV3Attention(nn.Cell):
return output
class DeepseekV3ParallelMLP(ParallelMLP):
r"""
Implementation of parallel feedforward block.
Args:
config (dict): Configuration.
is_expert (bool): Whether this block is an expert block. Default: False.
Inputs:
- **hidden_states** (Tensor) - Tensor of shape :math:`(B, S, H)`.
Outputs:
- **output** (Tensor) - Output tensor of shape :math:`(B, S, H)`.
Supported Platforms:
``Ascend``
"""
def __init__(self, config, is_expert=False):
super().__init__(config)
if is_expert:
raise NotImplementedError("For ParallelMLP, `is_expert` is not supported for now.")
def construct(self, x):
""" Construct function of mlp block. """
# [B, S, H] -> [B, S, ffn_H]
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size_per_partition, self.ffn_hidden_size_per_partition), -1)
else:
gate = self.w1(x) # dp,1 -> dp, mp
hidden = self.w3(x) # dp,1 -> dp, mp
gate = self.act_func(gate)
hidden = mint.mul(hidden, gate)
# [B, S, ffn_H] -> [B, S, H]
output = self.w2(hidden)
return output
class DeepseekV3MoE(Cell):
r"""
This is an implementation of the mixture-of-experts (MoE) block in DeepSeek-V3.
@@ -795,7 +835,7 @@ class DeepseekV3DecodeLayer(nn.Cell):
if self.first_k_dense:
logger.warning("first_k_dense_replace is provided in MoEConfig, "
"a normal dense FFN will be used in this block.")
self.feed_forward = ParallelMLP(config)
self.feed_forward = DeepseekV3ParallelMLP(config)
else:
self.feed_forward = DeepseekV3MoE(config=config)
@@ -1069,21 +1109,32 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
self.gather = P.Gather()
self.sub_batch_valid_len = P.Sub()
self.model = DeepseekV3Model(config=config)
self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
config=config.parallel_config,
bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype,
weight_init="normal",
gather_output=True
)
if config.parallel_config.vocab_emb_dp:
self.lm_head = Linear(
in_channels=config.hidden_size,
out_channels=config.vocab_size,
weight_init="normal",
has_bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype
)
else:
self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
config=config.parallel_config,
bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype,
weight_init="normal",
gather_output=True
)
self.prefill_gather_flatten = P.Gather()
self.load_checkpoint(config)
self.predict_run_mode = get_predict_run_mode()
logger.info("Predict run mode:{}".format(self.predict_run_mode))
self.return_hidden_states = config.return_hidden_states
# pylint: disable=W0613
def prepare_inputs_for_predict_layout(self, input_ids, **kwargs):
@@ -1133,6 +1184,9 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
batch_valid_length = self.reshape(batch_valid_length, (-1,))
output = self.model(tokens, batch_valid_length, batch_index, zactivate_len, block_tables,
slot_mapping)
if self.return_hidden_states:
output = self.reshape(output, (-1, output.shape[-1]))
return output
pre_gather = (not self.use_past or self.is_first_iteration) and batch_valid_length is not None
output = self.pre_gather_func(pre_gather, output, batch_valid_length)
logits = self.lm_head(output)
@@ -1147,3 +1201,8 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
logits = self.reshape(logits, (-1, logits.shape[-1]))
return logits
return logits, tokens, input_mask
def kvcache(self, layer_idx):
"""Get the key cache for the given layer_idx."""
key_cache = self.model.layers[layer_idx].attention.infer_attention.paged_attention_mgr.key_cache
return key_cache, None
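The return_hidden_states flag added above short-circuits the head: when it is set, the model output is flattened to (tokens, hidden_size) and returned before lm_head is applied; otherwise the usual gather-and-project path produces logits. A toy NumPy sketch of that branch (flatten_or_project and the dense head below are hypothetical illustrations, not part of the model):

import numpy as np

def flatten_or_project(output, lm_head_weight, return_hidden_states):
    # Mirrors the new branch: hand back hidden states, or project to logits.
    if return_hidden_states:
        return output.reshape(-1, output.shape[-1])   # (B*S, H)
    return output @ lm_head_weight.T                  # (..., vocab_size)

batch, seq, hidden, vocab = 2, 3, 4, 10
output = np.random.randn(batch, seq, hidden).astype(np.float32)
lm_head_weight = np.random.randn(vocab, hidden).astype(np.float32)

print(flatten_or_project(output, lm_head_weight, True).shape)   # (6, 4)
print(flatten_or_project(output, lm_head_weight, False).shape)  # (2, 3, 10)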

View File

@@ -17,7 +17,7 @@ import mindspore as ms
from mindspore.common import dtype as mstype
from mindspore.ops import operations as P
from deepseek2_model import DeepseekV2ForCausalLM
from research.deepseek3.deepseek2_model import DeepseekV2ForCausalLM
class TrainingDeepseekV3ForCausalLM(DeepseekV2ForCausalLM):

View File

@@ -0,0 +1,121 @@
seed: 0
output_dir: './output' # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: True
load_checkpoint: "/path/to/deepseekr1/model_ckpt"
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model
# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepSeekR1'
# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
model_parallel: 32
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None
kernel_launch_group:
thread_num: 4
kernel_group_num: 16
# parallel context config
parallel:
parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend
full_batch: False
strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
# model config
model:
model_config:
type: DeepseekV3Config
auto_register: deepseek3_config.DeepseekV3Config
batch_size: 1 # add for incre predict
seq_length: 4096
hidden_size: 7168
num_layers: 61
num_heads: 128
max_position_embeddings: 163840
intermediate_size: 18432
kv_lora_rank: 512
q_lora_rank: 1536
qk_rope_head_dim: 64
v_head_dim: 128
qk_nope_head_dim: 128
vocab_size: 129280
multiple_of: 256
rms_norm_eps: 1.0e-6
bos_token_id: 0
eos_token_id: 1
pad_token_id: 1
ignore_token_id: -100
compute_dtype: "bfloat16"
layernorm_compute_type: "bfloat16"
softmax_compute_type: "bfloat16"
rotary_dtype: "bfloat16"
router_dense_type: "bfloat16"
param_init_type: "bfloat16"
scaling_factor:
beta_fast: 32.0
beta_slow: 1.0
factor: 40.0
mscale: 1.0
mscale_all_dim: 1.0
original_max_position_embeddings: 4096
use_past: True
extend_method: "YARN"
use_flash_attention: True
block_size: 16
num_blocks: 512
offset: 0
checkpoint_name_or_path: ""
repetition_penalty: 1
max_decode_length: 1024
top_k: 1
top_p: 1
theta: 10000.0
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM
arch:
type: DeepseekV3ForCausalLM
auto_register: deepseek3.DeepseekV3ForCausalLM
moe_config:
expert_num: 256
num_experts_chosen: 8
routing_policy: "TopkRouterV2"
shared_expert_num: 1
routed_scaling_factor: 2.5
first_k_dense_replace: 3
moe_intermediate_size: 2048
topk_group: 4
n_group: 8
processor:
return_tensors: ms
tokenizer:
unk_token: '<unk>'
bos_token: '<begin▁of▁sentence>'
eos_token: '<end▁of▁sentence>'
pad_token: '<end▁of▁sentence>'
type: LlamaTokenizerFast
vocab_file: '/path/to/deepseekr1/tokenizer.json'
tokenizer_file: '/path/to/deepseekr1/tokenizer.json'
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<User>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}}{{'<tool▁calls▁end><end▁of▁sentence>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<Assistant>' + message['content'] + '<end▁of▁sentence>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant>'}}{% endif %}"
type: LlamaProcessor

View File

@@ -0,0 +1,125 @@
seed: 0
output_dir: './output' # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: True
load_checkpoint: "/path/to/deepseekr1/model_w8a8_ckpt"
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model
# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepSeekR1-W8A8'
# default parallel of device num = 16 for Atlas 800T A2
parallel_config:
model_parallel: 16
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None
kernel_launch_group:
thread_num: 4
kernel_group_num: 16
# parallel context config
parallel:
parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend
full_batch: False
strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
# model config
model:
model_config:
type: DeepseekV3Config
auto_register: deepseek3_config.DeepseekV3Config
batch_size: 1 # add for incre predict
seq_length: 4096
hidden_size: 7168
num_layers: 61
num_heads: 128
max_position_embeddings: 163840
intermediate_size: 18432
kv_lora_rank: 512
q_lora_rank: 1536
qk_rope_head_dim: 64
v_head_dim: 128
qk_nope_head_dim: 128
vocab_size: 129280
multiple_of: 256
rms_norm_eps: 1.0e-6
bos_token_id: 0
eos_token_id: 1
pad_token_id: 1
ignore_token_id: -100
compute_dtype: "bfloat16"
layernorm_compute_type: "bfloat16"
softmax_compute_type: "bfloat16"
rotary_dtype: "bfloat16"
router_dense_type: "bfloat16"
param_init_type: "bfloat16"
scaling_factor:
beta_fast: 32.0
beta_slow: 1.0
factor: 40.0
mscale: 1.0
mscale_all_dim: 1.0
original_max_position_embeddings: 4096
use_past: True
extend_method: "YARN"
use_flash_attention: True
block_size: 16
num_blocks: 512
offset: 0
checkpoint_name_or_path: ""
repetition_penalty: 1
max_decode_length: 1024
top_k: 1
top_p: 1
theta: 10000.0
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
quantization_config:
quant_method: 'ptq'
weight_dtype: 'int8'
activation_dtype: 'int8'
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM
arch:
type: DeepseekV3ForCausalLM
auto_register: deepseek3.DeepseekV3ForCausalLM
moe_config:
expert_num: 256
num_experts_chosen: 8
routing_policy: "TopkRouterV2"
shared_expert_num: 1
routed_scaling_factor: 2.5
first_k_dense_replace: 3
moe_intermediate_size: 2048
topk_group: 4
n_group: 8
processor:
return_tensors: ms
tokenizer:
unk_token: '<unk>'
bos_token: '<begin▁of▁sentence>'
eos_token: '<end▁of▁sentence>'
pad_token: '<end▁of▁sentence>'
type: LlamaTokenizerFast
vocab_file: '/path/to/deepseekr1/tokenizer.json'
tokenizer_file: '/path/to/deepseekr1/tokenizer.json'
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<User>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}}{{'<tool▁calls▁end><end▁of▁sentence>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<Assistant>' + message['content'] + '<end▁of▁sentence>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant>'}}{% endif %}"
type: LlamaProcessor

View File

@@ -40,7 +40,6 @@ def convert_model_config(configs):
configs.mlp_has_gate = True
configs.post_norm = True
configs.recompute_granularity = None
configs.ffn_concat = False
configs.is_dynamic = True
parallel_config = configs.parallel_config

View File

@@ -110,6 +110,7 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
self.npu_mem_size = config.npu_mem_size if hasattr(config, "npu_mem_size") else 2
if config.tie_word_embeddings:
self.lm_head.weight = self.model.tok_embeddings.embedding_weight
self.return_hidden_states = config.return_hidden_states
# pylint: disable=W0613
def prepare_inputs_for_predict_layout(self, input_ids, **kwargs):
@@ -130,7 +131,7 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
have_prefix_keys_values = getattr(kwargs, "have_prefix_keys_values", False)
def get_input():
if self.npu_mem_size > 0:
if self.npu_mem_size >= 0:
return None
cache_list = []
for _ in self.model.layers:
@@ -175,6 +176,9 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
batch_valid_length = self.reshape(batch_valid_length, (-1,))
output = self.model(input_ids, batch_valid_length, batch_index, zactivate_len, block_tables,
slot_mapping, prefix_keys_values, key_cache=key_cache, value_cache=value_cache)
if self.return_hidden_states:
output = self.reshape(output, (-1, output.shape[-1]))
return output
pre_gather = (not self.use_past or self.is_first_iteration) and batch_valid_length is not None
if pre_gather:
if not self.is_pynative:

View File

@@ -2274,7 +2274,7 @@
"signature": "(config: mindformers.models.configuration_utils.PretrainedConfig, *inputs, **kwargs)"
},
"mindformers.experimental.model.llama.LlamaPretrainedModel.config_class": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.experimental.model.llama.llama.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig, num_tokentypes: int = 0, parallel_output: bool = True, pre_process: bool = True, post_process: bool = True, **kwargs)"
@@ -2304,7 +2304,7 @@
"signature": "(config: mindformers.models.configuration_utils.PretrainedConfig, *inputs, **kwargs)"
},
"mindformers.experimental.model.llama.llama.LlamaPretrainedModel.config_class": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.experimental.models.Qwen2ForCausalLM": {
"signature": "(config)"
@@ -4119,7 +4119,7 @@
"signature": "(self, conversation, return_tensors=None, **tokenizer_kwargs)"
},
"mindformers.models.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig = None)"
@@ -5622,7 +5622,7 @@
"signature": "(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]"
},
"mindformers.models.llama.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.llama.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig = None)"
@@ -5763,7 +5763,7 @@
"signature": "(self)"
},
"mindformers.models.llama.llama_config.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.llama.llama_processor.LlamaProcessor": {
"signature": "(tokenizer=None, max_length=128, padding='max_length', return_tensors='ms')"