deepseek code sync from os branch
@@ -221,16 +221,26 @@ class MFContextOperator(MFContextConfig):
return cls._instance

def __init__(self, config):
self.config = config
supported_kwargs = self._handle_data()
logger.debug('MFContextConfig load configs: %s', supported_kwargs)
super(MFContextOperator, self).__init__(**supported_kwargs)
use_past = self.config.get_value('model.model_config.use_past', False)
if not hasattr(self, 'train_precision_sync'):
self.train_precision_sync = None
if not hasattr(self, 'infer_precision_sync'):
self.infer_precision_sync = None
self.set_env(use_past)
if not hasattr(self, '_initailed'):
self.config = config
supported_kwargs = self._handle_data()
logger.debug('MFContextConfig load configs: %s', supported_kwargs)
super(MFContextOperator, self).__init__(**supported_kwargs)
use_past = self.config.get_value('model.model_config.use_past',
False)
if not hasattr(self, 'train_precision_sync'):
self.train_precision_sync = None
if not hasattr(self, 'infer_precision_sync'):
self.infer_precision_sync = None
self.set_env(use_past)
self._initailed = True

@classmethod
def get_mf_ctx_instance(cls):
"""Check if singleton Context exists."""
if cls._instance:
return cls.__new__(cls)
return None

def _handle_data(self):
ctx_config = self.config.get('context', {})
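A minimal sketch of the re-initialization guard introduced in this hunk (not the MindFormers code itself): `__new__` hands back a cached instance, and `__init__` only runs its body once, so constructing the context repeatedly does not redo environment setup. Names below are illustrative; the diff spells its flag `_initailed`.

```python
class Ctx:
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, config):
        # Python calls __init__ on every construction, even when __new__
        # returns the cached object, hence the explicit one-shot flag.
        if not hasattr(self, "_initialized"):
            self.config = config
            self._initialized = True


a = Ctx({"mode": 0})
b = Ctx({"mode": 1})
assert a is b and a.config == {"mode": 0}  # the second call skipped setup
```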
@@ -28,6 +28,9 @@ from mindformers.experimental.infer.core.layers import ColumnParallelLinear, Row
from mindformers.experimental.infer.core.mapping import ReduceFromModelParallelRegion

# pylint: disable=C0412
from mindformers.experimental.infer.core.utils import get_tp_world_size
from mindformers.tools.utils import divide

try:
from mindspore.ops.auto_generate import (MoeComputeExpertTokens,
MoeFinalizeRouting,
@@ -230,23 +233,33 @@ class SharedParallelMLP(nn.Cell):
self.hidden_size = self.config.hidden_size
self.ffn_hidden_size = intermediate_size
self.mlp_has_gate = self.config.mlp_has_gate

self.w1 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.w3 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.ffn_concat = self.config.ffn_concat
if self.ffn_concat:
self.w_gate_hidden = Linear(
self.hidden_size,
self.ffn_hidden_size * 2,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
else:
self.w1 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)
self.w3 = Linear(
self.hidden_size,
self.ffn_hidden_size,
has_bias=self.has_bias,
transpose_b=True,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
)

self.act_type = self.config.hidden_act
self.act_func = get_act_func(self.act_type)
@@ -265,8 +278,13 @@ class SharedParallelMLP(nn.Cell):

def construct(self, x):
""" Construct function of mlp block. """
gate = self.w1(x)
hidden = self.w3(x)
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size, self.ffn_hidden_size), -1)
else:
gate = self.w1(x)
hidden = self.w3(x)
gate = self.act_func(gate)
hidden = mint.mul(hidden, gate)
output = self.w2(hidden)
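The `ffn_concat` branch replaces the two projections `w1` and `w3` with a single fused `w_gate_hidden` matmul whose output is split in half along the last axis; the halves then recombine as `act(gate) * hidden`. A minimal NumPy sketch of that equivalence, using SiLU purely for illustration (this is not the MindSpore implementation):

```python
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

hidden_size, ffn_hidden_size = 8, 16
rng = np.random.default_rng(0)
x = rng.standard_normal((4, hidden_size))
w1 = rng.standard_normal((hidden_size, ffn_hidden_size))  # gate projection
w3 = rng.standard_normal((hidden_size, ffn_hidden_size))  # hidden projection

# Two separate matmuls, as in the non-concat branch.
out_separate = (x @ w3) * silu(x @ w1)

# One fused [H, 2 * ffn_H] matmul, then a split, as in the ffn_concat branch.
w_gate_hidden = np.concatenate([w1, w3], axis=-1)
gate, hidden = np.split(x @ w_gate_hidden, 2, axis=-1)
out_fused = hidden * silu(gate)

assert np.allclose(out_separate, out_fused)
```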
@@ -294,20 +312,48 @@ class RoutedParallelMLP(nn.Cell):
self.cast = P.Cast()
self.act_type = self.config.hidden_act
self.act_func = get_act_func(self.act_type)
self.ffn_concat = self.config.ffn_concat
tp_group_size = get_tp_world_size()
self.ffn_hidden_size_per_partition = divide(self.ffn_hidden_size, tp_group_size)
if self.ffn_concat:
self.w_gate_hidden = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size * 2,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
else:
self.w1 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)

self.w1 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)

self.w3 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)
self.w2 = RowParallelLinear(
self.ffn_hidden_size,
self.hidden_size,
@@ -323,27 +369,18 @@ class RoutedParallelMLP(nn.Cell):
moe_delay_allreduce=True,
)

self.w3 = ColumnParallelLinear(
self.hidden_size,
self.ffn_hidden_size,
config=self.config.parallel_config,
bias=self.has_bias,
transpose_b=True,
gather_output=False,
param_init_type=self.config.param_init_dtype,
compute_dtype=self.config.compute_dtype,
is_expert=True,
expert_num=self.config.moe_config.expert_num,
)

def construct(self, x, group_list=None):
"""Forward process of the FeedForward"""
x = self.cast(x, self.config.compute_dtype)
gate = self.w1(x, group_list=group_list)
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x, group_list=group_list)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size_per_partition, self.ffn_hidden_size_per_partition), -1)
else:
gate = self.w1(x, group_list=group_list)
hidden = self.w3(x, group_list=group_list)
gate = self.act_func(gate)
hidden = self.w3(x, group_list=group_list)
hidden = mint.mul(hidden, gate)
output = self.w2(hidden, group_list=group_list)
output = self.w2(hidden, group_list)
return output
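In the routed (expert) path the fused projection is column-parallel, so each tensor-parallel rank holds only `ffn_hidden_size / tp_group_size` columns of each half, and the split above uses the per-partition size. A small sketch of that sharding arithmetic in plain Python; `divide` in `mindformers.tools.utils` is, to my understanding, an exact-division helper along these lines, so treat this as an assumption rather than its actual implementation:

```python
def divide(numerator: int, denominator: int) -> int:
    # Exact division with a loud failure, as wanted for sharding sizes.
    if numerator % denominator != 0:
        raise ValueError(f"{numerator} is not divisible by {denominator}")
    return numerator // denominator

ffn_hidden_size = 2048  # e.g. moe_intermediate_size in the configs below
for tp_group_size in (8, 16, 32):
    per_partition = divide(ffn_hidden_size, tp_group_size)
    # The fused w_gate_hidden output on one rank is 2 * per_partition wide,
    # and the split sizes are (per_partition, per_partition).
    print(tp_group_size, per_partition, (per_partition, per_partition))
```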
@@ -18,10 +18,10 @@ import math

import mindspore.common.dtype as mstype
from mindspore import ops as P
from mindspore import nn
from mindspore import nn, Parameter
from mindspore.common.initializer import initializer
from mindformers.experimental.infer.core.utils import create_empty_parameter

class ParallelPagedAttentionMgr(nn.Cell):
"""Paged Attention Manager."""
def __init__(self,
@@ -59,6 +59,12 @@ class ParallelPagedAttentionMgr(nn.Cell):
name="value_cache",
requires_grad=False,
)
elif self.npu_mem_size == 0:
self.key_cache = Parameter(initializer('zeros', kv_shape, compute_dtype), name="key_cache",
requires_grad=False)
self.value_cache = Parameter(initializer('zeros', kv_shape, compute_dtype), name="value_cache",
requires_grad=False)

self.reshape_and_cache = P.auto_generate.ReshapeAndCache()
self.paged_attention = P.auto_generate.PagedAttention(self.n_heads,
self.scale_value,
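When `npu_mem_size == 0`, the key and value caches are created as zero-initialized, non-trainable parameters of shape `kv_shape` (the paged layout). A host-side NumPy analogue, purely to make the shapes and memory footprint concrete; the block count, block size, head count, and head dim below are illustrative, not taken from this diff:

```python
import numpy as np

# Hypothetical paged-KV layout: (num_blocks, block_size, n_kv_heads, head_dim)
num_blocks, block_size, n_kv_heads, head_dim = 512, 16, 8, 128
kv_shape = (num_blocks, block_size, n_kv_heads, head_dim)

key_cache = np.zeros(kv_shape, dtype=np.float16)
value_cache = np.zeros(kv_shape, dtype=np.float16)

# Rough per-tensor footprint, which is why cache sizing is config-driven.
print(f"{key_cache.nbytes / 2**20:.0f} MiB per cache tensor")
```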
@@ -112,6 +112,7 @@ class LlamaConfig(PretrainedConfig):
calculate_per_token_loss (bool, optional): Whether to calculate the loss of each token. Default: ``False``.
pipeline_stage (dict, optional): A dict set the start_stage, stage_num, and offset of the model when
pipeline parallelism. Default: ``None``.
return_hidden_states (bool, optional): Whether to return hidden states. Default: ``False``.

Returns:
LlamaConfig, a LlamaConfig instance.
@@ -191,6 +192,7 @@ class LlamaConfig(PretrainedConfig):
chunk_prefill: bool = False,
calculate_per_token_loss: bool = False,
pipeline_stage: dict = None,
return_hidden_states: bool = False,
**kwargs):
"""
Note:
@@ -267,6 +269,7 @@ class LlamaConfig(PretrainedConfig):
self.input_sliced_sig = input_sliced_sig
self.rmsnorm_compute_2d = rmsnorm_compute_2d
self.chunk_prefill = chunk_prefill
self.return_hidden_states = return_hidden_states
self.calculate_per_token_loss = calculate_per_token_loss
if (pipeline_stage is not None and
pipeline_stage["start_stage"] + pipeline_stage["stage_num"] <= parallel_config.pipeline_stage):
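The guard at the end of this hunk only accepts a `pipeline_stage` dict whose `start_stage + stage_num` fits inside the configured pipeline depth. A minimal sketch of that check in isolation (a hypothetical helper, not the LlamaConfig code):

```python
from typing import Optional

def pipeline_stage_fits(pipeline_stage: Optional[dict], total_pp_stages: int) -> bool:
    """True when [start_stage, start_stage + stage_num) lies within the pipeline."""
    if pipeline_stage is None:
        return False
    return pipeline_stage["start_stage"] + pipeline_stage["stage_num"] <= total_pp_stages

print(pipeline_stage_fits({"start_stage": 1, "stage_num": 2, "offset": 0}, 4))  # True
print(pipeline_stage_fits({"start_stage": 3, "stage_num": 2, "offset": 0}, 4))  # False
```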
@@ -28,16 +28,16 @@ from mindformers.version_control import synchronize
def get_profile_settings():
"""Get profile settings for context."""
context_module = import_module("mindformers.core.context.build_context")
context_instance = context_module.Context()
mf_ctx_instance = context_module.MFContextOperator.get_mf_ctx_instance()
profile = False
profile_start_step = 0
profile_stop_step = 0
profile_level = 1
if context_instance is not None and context_instance.is_exists():
profile = context_module.get_context("profile")
profile_start_step = context_module.get_context("profile_start_step")
profile_stop_step = context_module.get_context("profile_stop_step")
profile_level = context_module.get_context("profile_level") or profile_level
if mf_ctx_instance is not None:
profile = getattr(mf_ctx_instance, 'profile', profile)
profile_start_step = getattr(mf_ctx_instance, 'profile_start_step', profile_start_step)
profile_stop_step = getattr(mf_ctx_instance, 'profile_stop_step', profile_stop_step)
profile_level = getattr(mf_ctx_instance, 'profile_level', profile_level)
max_level = len(ProfilerLevel.__members__) - 1
if profile_level < 0 or profile_level > max_level:
logger.warning("Invalid profile level: %s, return level1.", profile_level)
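The reworked `get_profile_settings` starts from defaults, lets the MF context override them via `getattr`, and rejects levels outside the `ProfilerLevel` enum. A small self-contained sketch of that override-then-validate flow; the enum members here are invented, only the member count matters:

```python
from enum import Enum

class ProfilerLevel(Enum):  # hypothetical members for illustration
    Level0 = 0
    Level1 = 1
    Level2 = 2

def resolve_profile_level(requested, default=1):
    max_level = len(ProfilerLevel.__members__) - 1
    if requested is None:
        return default
    if requested < 0 or requested > max_level:
        print(f"Invalid profile level: {requested}, return level{default}.")
        return default
    return requested

print(resolve_profile_level(None))  # 1 (default)
print(resolve_profile_level(2))     # 2
print(resolve_profile_level(7))     # warns, falls back to 1
```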
@@ -51,7 +51,7 @@ from mindformers.modules.transformer.moe import MoEV2
from mindformers.modules.transformer.moe import MoEInfer
from mindformers.version_control import check_seqpp_fa_opt_support

from deepseek2_config import DeepseekV2Config
from research.deepseek3.deepseek2_config import DeepseekV2Config

__all__ = ['DeepseekV2ForCausalLM', 'DeepseekV2Model']
@@ -17,8 +17,8 @@ import os

from mindformers.tools.register import MindFormerRegister, MindFormerModuleType

from deepseek3_model_train import TrainingDeepseekV3ForCausalLM
from deepseek3_model_infer import InferenceDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_model_train import TrainingDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM

__all__ = ['DeepseekV3ForCausalLM']
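Several hunks in this commit swap bare imports such as `from deepseek3_config import ...` for package-qualified ones rooted at `research.deepseek3`, so the modules resolve when scripts are launched from the repository root instead of from inside the model directory. A throwaway, self-contained demonstration of that difference (the `demo_mod` module and the temporary layout are invented for this sketch):

```python
import importlib
import pathlib
import sys
import tempfile

# Build a disposable repo layout:  <repo>/research/deepseek3/demo_mod.py
repo = pathlib.Path(tempfile.mkdtemp())
pkg = repo / "research" / "deepseek3"
pkg.mkdir(parents=True)
(repo / "research" / "__init__.py").write_text("")
(pkg / "__init__.py").write_text("")
(pkg / "demo_mod.py").write_text("VALUE = 42\n")

sys.path.insert(0, str(repo))  # simulate running from the repo root
importlib.invalidate_caches()

try:
    importlib.import_module("demo_mod")  # bare name: its folder is not on sys.path
except ModuleNotFoundError:
    print("bare import fails when launched from the repo root")

mod = importlib.import_module("research.deepseek3.demo_mod")  # package-qualified
print(mod.VALUE)  # 42
```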
@@ -10,7 +10,7 @@ auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distr
# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepseekV3'
model_name: 'DeepSeekV3'

# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
@@ -22,8 +22,13 @@ parallel_config:
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "58GB"
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None

kernel_launch_group:
thread_num: 4
kernel_group_num: 16

# parallel context config
parallel:
@@ -83,6 +88,7 @@ model:
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM
@@ -4,17 +4,17 @@ run_mode: 'predict'
use_parallel: True

load_checkpoint: "/path/to/deepseekv3/model_w8a8_ckpt"
load_ckpt_format: "ckpt"
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model

# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepseekV3-W8A8'
model_name: 'DeepSeekV3-W8A8'

# default parallel of device num = 32 for Atlas 800T A2
# default parallel of device num = 16 for Atlas 800T A2
parallel_config:
model_parallel: 32
model_parallel: 16
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False
@@ -22,8 +22,13 @@ parallel_config:
# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "58GB"
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None

kernel_launch_group:
thread_num: 4
kernel_group_num: 16

# parallel context config
parallel:
@@ -83,6 +88,7 @@ model:
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
quantization_config:
quant_method: 'ptq'
weight_dtype: 'int8'
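Across these config hunks the predict YAMLs raise `max_device_memory` to "61GB", add `affinity_cpu_list` and a `kernel_launch_group` section, and halve the W8A8 model parallelism from 32 to 16. A hedged sketch of inspecting such a file programmatically; it assumes PyYAML is installed, and the filename is a placeholder for one of the predict configs touched here:

```python
import yaml  # PyYAML: pip install pyyaml

# Placeholder path; point it at one of the predict_*.yaml files from this commit.
with open("predict_deepseek3_w8a8.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["parallel_config"]["model_parallel"])   # expected 16 after this change
print(cfg["context"]["max_device_memory"])        # "61GB"
print(cfg["model"]["model_config"]["quantization_config"]["quant_method"])  # 'ptq'
```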
@@ -23,7 +23,7 @@ from mindformers.modules.transformer.transformer import default_transformer_conf
from mindformers.models.utils import convert_mstype
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from mindformers.mindformer_book import MindFormerBook
from deepseek2_config import DeepseekV2Config
from research.deepseek3.deepseek2_config import DeepseekV2Config

__all__ = ['DeepseekV3Config']
@@ -135,6 +135,7 @@ class DeepseekV3Config(DeepseekV2Config):
init_method_std=0.006,
qkv_has_bias=False,
qkv_concat=False,
ffn_concat=False,
parallel_config: Union[dict, TransformerOpParallelConfig] = default_transformer_config,
moe_config: Union[dict, MoEConfig] = default_moe_config,
use_past: bool = False,
@@ -160,6 +161,7 @@ class DeepseekV3Config(DeepseekV2Config):
use_fused_rope=False,
use_fused_swiglu=False,
enable_fa_var_len=False,
return_hidden_states=False,
**kwargs):
super(DeepseekV3Config, self).__init__(**kwargs)
if isinstance(parallel_config, dict):
@@ -187,6 +189,7 @@ class DeepseekV3Config(DeepseekV2Config):
self.ffn_dim_multiplier = ffn_dim_multiplier
self.rms_norm_eps = rms_norm_eps
self.qkv_concat = qkv_concat
self.ffn_concat = ffn_concat
self.param_init_type = convert_mstype(param_init_type)
self.qkv_has_bias = qkv_has_bias
self.layernorm_compute_type = convert_mstype(layernorm_compute_type)
@@ -226,3 +229,4 @@ class DeepseekV3Config(DeepseekV2Config):
self.use_fused_rope = use_fused_rope
self.use_fused_swiglu = use_fused_swiglu
self.enable_fa_var_len = enable_fa_var_len
self.return_hidden_states = return_hidden_states
@@ -24,6 +24,7 @@ from mindspore.ops import operations as P
from mindspore.nn.cell import Cell
from mindspore.common.initializer import Zero
from mindspore.communication._comm_helper import _is_initialized

try:
from mindspore._checkparam import Validator
except ImportError:
@@ -45,8 +46,8 @@ from mindformers.experimental.infer.core.norm import RMSNorm
from mindformers.experimental.infer.core.moe import RoutedParallelMLP, SharedParallelMLP, ParallelMoEV2
from mindformers.experimental.infer.core.transformer import ParallelMLP, VocabEmbedding

from deepseek3_config import DeepseekV3Config
from utils import convert_model_config
from research.deepseek3.deepseek3_config import DeepseekV3Config
from research.deepseek3.utils import convert_model_config

__all__ = ['InferenceDeepseekV3ForCausalLM', 'DeepseekV3Model']
@@ -420,7 +421,6 @@ class DeepseekV3Attention(nn.Cell):
.format(self.n_kv_head, parallel_config.model_parallel))
self.shape = P.Shape()
self.cast = P.Cast()

if self.q_lora_rank == 0:
self.q_proj = ColumnParallelLinear(
self.hidden_size,
@@ -609,6 +609,46 @@ class DeepseekV3Attention(nn.Cell):
return output

class DeepseekV3ParallelMLP(ParallelMLP):
r"""
Implementation of parallel feedforward block.

Args:
config (dict): Configuration.
is_expert (book): This block is an expert block. Default: False.

Inputs:
- **hidden_states** (Tensor) - Tensor of shape :math:`(B, S, H)`.

Outputs:
- **output** (Tensor) - Output tensor of shape :math:`(B, S, H)`.

Supported Platforms:
``Ascend``
"""
def __init__(self, config, is_expert=False):
super().__init__(config)
if is_expert:
raise NotImplementedError("For ParallelMLP, `is_expert` is not supported for now.")

def construct(self, x):
""" Construct function of mlp block. """
# [B, S, H] -> [B, S, ffn_H]
if self.ffn_concat:
gate_hidden_out = self.w_gate_hidden(x)  # dp,1 -> dp, mp
gate, hidden = mint.split(gate_hidden_out,
(self.ffn_hidden_size_per_partition, self.ffn_hidden_size_per_partition), -1)
else:
gate = self.w1(x)  # dp,1 -> dp, mp
hidden = self.w3(x)  # dp,1 -> dp, mp
gate = self.act_func(gate)
hidden = mint.mul(hidden, gate)

# [B, S, ffn_H] -> [B, S, H]
output = self.w2(hidden)
return output

class DeepseekV3MoE(Cell):
r"""
This is an implementation of self-attention mechanism in DeepSeek-V3.
@@ -795,7 +835,7 @@ class DeepseekV3DecodeLayer(nn.Cell):
if self.first_k_dense:
logger.warning("first_k_dense_replace is provided in MoEConfig, "
"a normal dense FFN will be used in this block.")
self.feed_forward = ParallelMLP(config)
self.feed_forward = DeepseekV3ParallelMLP(config)
else:
self.feed_forward = DeepseekV3MoE(config=config)
@@ -1069,21 +1109,32 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
self.gather = P.Gather()
self.sub_batch_valid_len = P.Sub()
self.model = DeepseekV3Model(config=config)
self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
config=config.parallel_config,
bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype,
weight_init="normal",
gather_output=True
)
if config.parallel_config.vocab_emb_dp:
self.lm_head = Linear(
in_channels=config.hidden_size,
out_channels=config.vocab_size,
weight_init="normal",
has_bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype
)
else:
self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
config=config.parallel_config,
bias=False,
param_init_type=config.param_init_type,
compute_dtype=config.compute_dtype,
weight_init="normal",
gather_output=True
)
self.prefill_gather_flatten = P.Gather()

self.load_checkpoint(config)
self.predict_run_mode = get_predict_run_mode()
logger.info("Predict run mode:{}".format(self.predict_run_mode))
self.return_hidden_states = config.return_hidden_states

# pylint: disable=W0613
def prepare_inputs_for_predict_layout(self, input_ids, **kwargs):
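The rewritten head keeps a plain `Linear` when `vocab_emb_dp` is true and otherwise builds a vocab-sharded `ColumnParallelLinear` with `gather_output=True`, since each tensor-parallel rank then produces logits for only a slice of the vocabulary. A NumPy sketch of why that gather is needed, with two simulated ranks and illustrative shapes (not the MindFormers layers themselves):

```python
import numpy as np

hidden_size, vocab_size, tp = 8, 12, 2
rng = np.random.default_rng(0)
hidden = rng.standard_normal((3, hidden_size))            # 3 tokens
w_full = rng.standard_normal((hidden_size, vocab_size))   # full lm_head weight

# Column parallelism: each "rank" owns vocab_size / tp output columns.
shards = np.split(w_full, tp, axis=-1)
partial_logits = [hidden @ w for w in shards]              # per-rank logit slices

# gather_output=True corresponds to concatenating the slices back together.
logits = np.concatenate(partial_logits, axis=-1)
assert np.allclose(logits, hidden @ w_full)
```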
@@ -1133,6 +1184,9 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
batch_valid_length = self.reshape(batch_valid_length, (-1,))
output = self.model(tokens, batch_valid_length, batch_index, zactivate_len, block_tables,
slot_mapping)
if self.return_hidden_states:
output = self.reshape(output, (-1, output.shape[-1]))
return output
pre_gather = (not self.use_past or self.is_first_iteration) and batch_valid_length is not None
output = self.pre_gather_func(pre_gather, output, batch_valid_length)
logits = self.lm_head(output)
@@ -1147,3 +1201,8 @@ class InferenceDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
logits = self.reshape(logits, (-1, logits.shape[-1]))
return logits
return logits, tokens, input_mask

def kvcache(self, layer_idx):
"""Get the key_cache depend on layer_idx."""
key_cache = self.model.layers[layer_idx].attention.infer_attention.paged_attention_mgr.key_cache
return key_cache, None
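With `return_hidden_states` enabled, the forward pass now flattens the backbone output to `(-1, hidden_size)` and returns it before the lm_head is applied; otherwise it gathers and projects logits as before. A compact control-flow sketch of that switch with NumPy stand-ins (not the MindSpore graph code):

```python
import numpy as np

def forward(hidden, lm_head_weight, return_hidden_states=False):
    # hidden: (batch, seq, hidden_size) backbone output
    if return_hidden_states:
        return hidden.reshape(-1, hidden.shape[-1])  # early exit, skip the lm_head
    logits = hidden @ lm_head_weight                 # (batch, seq, vocab_size)
    return logits.reshape(-1, logits.shape[-1])

h = np.zeros((2, 3, 4))
w = np.zeros((4, 10))
print(forward(h, w, return_hidden_states=True).shape)  # (6, 4)
print(forward(h, w).shape)                             # (6, 10)
```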
@@ -17,7 +17,7 @@ import mindspore as ms
from mindspore.common import dtype as mstype
from mindspore.ops import operations as P

from deepseek2_model import DeepseekV2ForCausalLM
from research.deepseek3.deepseek2_model import DeepseekV2ForCausalLM

class TrainingDeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
@@ -0,0 +1,121 @@
seed: 0
output_dir: './output' # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: True

load_checkpoint: "/path/to/deepseekr1/model_ckpt"
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model

# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepSeekR1'

# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
model_parallel: 32
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False

# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None

kernel_launch_group:
thread_num: 4
kernel_group_num: 16

# parallel context config
parallel:
parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend
full_batch: False
strategy_ckpt_save_file: "./ckpt_strategy.ckpt"

# model config
model:
model_config:
type: DeepseekV3Config
auto_register: deepseek3_config.DeepseekV3Config
batch_size: 1 # add for incre predict
seq_length: 4096
hidden_size: 7168
num_layers: 61
num_heads: 128
max_position_embeddings: 163840
intermediate_size: 18432
kv_lora_rank: 512
q_lora_rank: 1536
qk_rope_head_dim: 64
v_head_dim: 128
qk_nope_head_dim: 128
vocab_size: 129280
multiple_of: 256
rms_norm_eps: 1.0e-6
bos_token_id: 0
eos_token_id: 1
pad_token_id: 1
ignore_token_id: -100
compute_dtype: "bfloat16"
layernorm_compute_type: "bfloat16"
softmax_compute_type: "bfloat16"
rotary_dtype: "bfloat16"
router_dense_type: "bfloat16"
param_init_type: "bfloat16"
scaling_factor:
beta_fast: 32.0
beta_slow: 1.0
factor: 40.0
mscale: 1.0
mscale_all_dim: 1.0
original_max_position_embeddings: 4096
use_past: True
extend_method: "YARN"
use_flash_attention: True
block_size: 16
num_blocks: 512
offset: 0
checkpoint_name_or_path: ""
repetition_penalty: 1
max_decode_length: 1024
top_k: 1
top_p: 1
theta: 10000.0
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM
arch:
type: DeepseekV3ForCausalLM
auto_register: deepseek3.DeepseekV3ForCausalLM

moe_config:
expert_num: 256
num_experts_chosen: 8
routing_policy: "TopkRouterV2"
shared_expert_num: 1
routed_scaling_factor: 2.5
first_k_dense_replace: 3
moe_intermediate_size: 2048
topk_group: 4
n_group: 8

processor:
return_tensors: ms
tokenizer:
unk_token: '<unk>'
bos_token: '<|begin▁of▁sentence|>'
eos_token: '<|end▁of▁sentence|>'
pad_token: '<|end▁of▁sentence|>'
type: LlamaTokenizerFast
vocab_file: '/path/to/deepseekr1/tokenizer.json'
tokenizer_file: '/path/to/deepseekr1/tokenizer.json'
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}"
type: LlamaProcessor
@@ -0,0 +1,125 @@
seed: 0
output_dir: './output' # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: True

load_checkpoint: "/path/to/deepseekr1/model_w8a8_ckpt"
load_ckpt_format: "safetensors"
auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model

# trainer config
trainer:
type: CausalLanguageModelingTrainer
model_name: 'DeepSeekR1-W8A8'

# default parallel of device num = 16 for Atlas 800T A2
parallel_config:
model_parallel: 16
pipeline_stage: 1
expert_parallel: 1
vocab_emb_dp: False

# mindspore context init config
context:
mode: 0 # 0--Graph Mode; 1--Pynative Mode
max_device_memory: "61GB"
device_id: 0
affinity_cpu_list: None

kernel_launch_group:
thread_num: 4
kernel_group_num: 16

# parallel context config
parallel:
parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend
full_batch: False
strategy_ckpt_save_file: "./ckpt_strategy.ckpt"

# model config
model:
model_config:
type: DeepseekV3Config
auto_register: deepseek3_config.DeepseekV3Config
batch_size: 1 # add for incre predict
seq_length: 4096
hidden_size: 7168
num_layers: 61
num_heads: 128
max_position_embeddings: 163840
intermediate_size: 18432
kv_lora_rank: 512
q_lora_rank: 1536
qk_rope_head_dim: 64
v_head_dim: 128
qk_nope_head_dim: 128
vocab_size: 129280
multiple_of: 256
rms_norm_eps: 1.0e-6
bos_token_id: 0
eos_token_id: 1
pad_token_id: 1
ignore_token_id: -100
compute_dtype: "bfloat16"
layernorm_compute_type: "bfloat16"
softmax_compute_type: "bfloat16"
rotary_dtype: "bfloat16"
router_dense_type: "bfloat16"
param_init_type: "bfloat16"
scaling_factor:
beta_fast: 32.0
beta_slow: 1.0
factor: 40.0
mscale: 1.0
mscale_all_dim: 1.0
original_max_position_embeddings: 4096
use_past: True
extend_method: "YARN"
use_flash_attention: True
block_size: 16
num_blocks: 512
offset: 0
checkpoint_name_or_path: ""
repetition_penalty: 1
max_decode_length: 1024
top_k: 1
top_p: 1
theta: 10000.0
do_sample: False
is_dynamic: True
qkv_concat: False
ffn_concat: False
quantization_config:
quant_method: 'ptq'
weight_dtype: 'int8'
activation_dtype: 'int8'
auto_map:
AutoConfig: deepseek3_config.DeepseekV3Config
AutoModel: deepseek3.DeepseekV3ForCausalLM
arch:
type: DeepseekV3ForCausalLM
auto_register: deepseek3.DeepseekV3ForCausalLM

moe_config:
expert_num: 256
num_experts_chosen: 8
routing_policy: "TopkRouterV2"
shared_expert_num: 1
routed_scaling_factor: 2.5
first_k_dense_replace: 3
moe_intermediate_size: 2048
topk_group: 4
n_group: 8

processor:
return_tensors: ms
tokenizer:
unk_token: '<unk>'
bos_token: '<|begin▁of▁sentence|>'
eos_token: '<|end▁of▁sentence|>'
pad_token: '<|end▁of▁sentence|>'
type: LlamaTokenizerFast
vocab_file: '/path/to/deepseekr1/tokenizer.json'
tokenizer_file: '/path/to/deepseekr1/tokenizer.json'
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}"
type: LlamaProcessor
@@ -40,7 +40,6 @@ def convert_model_config(configs):
configs.mlp_has_gate = True
configs.post_norm = True
configs.recompute_granularity = None
configs.ffn_concat = False
configs.is_dynamic = True

parallel_config = configs.parallel_config
@@ -110,6 +110,7 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
self.npu_mem_size = config.npu_mem_size if hasattr(config, "npu_mem_size") else 2
if config.tie_word_embeddings:
self.lm_head.weight = self.model.tok_embeddings.embedding_weight
self.return_hidden_states = config.return_hidden_states

# pylint: disable=W0613
def prepare_inputs_for_predict_layout(self, input_ids, **kwargs):
@@ -130,7 +131,7 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
have_prefix_keys_values = getattr(kwargs, "have_prefix_keys_values", False)

def get_input():
if self.npu_mem_size > 0:
if self.npu_mem_size >= 0:
return None
cache_list = []
for _ in self.model.layers:
@@ -175,6 +176,9 @@ class ParallelQwenForCausalLM(LlamaPreTrainedModel):
batch_valid_length = self.reshape(batch_valid_length, (-1,))
output = self.model(input_ids, batch_valid_length, batch_index, zactivate_len, block_tables,
slot_mapping, prefix_keys_values, key_cache=key_cache, value_cache=value_cache)
if self.return_hidden_states:
output = self.reshape(output, (-1, output.shape[-1]))
return output
pre_gather = (not self.use_past or self.is_first_iteration) and batch_valid_length is not None
if pre_gather:
if not self.is_pynative:
@@ -2274,7 +2274,7 @@
"signature": "(config: mindformers.models.configuration_utils.PretrainedConfig, *inputs, **kwargs)"
},
"mindformers.experimental.model.llama.LlamaPretrainedModel.config_class": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.experimental.model.llama.llama.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig, num_tokentypes: int = 0, parallel_output: bool = True, pre_process: bool = True, post_process: bool = True, **kwargs)"
@@ -2304,7 +2304,7 @@
"signature": "(config: mindformers.models.configuration_utils.PretrainedConfig, *inputs, **kwargs)"
},
"mindformers.experimental.model.llama.llama.LlamaPretrainedModel.config_class": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.experimental.models.Qwen2ForCausalLM": {
"signature": "(config)"
@@ -4119,7 +4119,7 @@
"signature": "(self, conversation, return_tensors=None, **tokenizer_kwargs)"
},
"mindformers.models.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig = None)"
@@ -5622,7 +5622,7 @@
"signature": "(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]"
},
"mindformers.models.llama.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.llama.LlamaForCausalLM": {
"signature": "(config: mindformers.models.llama.llama_config.LlamaConfig = None)"
@@ -5763,7 +5763,7 @@
"signature": "(self)"
},
"mindformers.models.llama.llama_config.LlamaConfig": {
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, **kwargs)"
"signature": "(batch_size: int = 1, seq_length: int = 2048, hidden_size: int = 4096, num_layers: int = 32, num_heads: int = 32, n_kv_heads: Optional[int] = None, max_position_embedding: Optional[int] = None, intermediate_size: Optional[int] = None, vocab_size: int = 32000, multiple_of: int = 256, ffn_dim_multiplier: Optional[int] = None, rms_norm_eps: float = 1e-05, bos_token_id: int = 1, eos_token_id: int = 2, pad_token_id: int = 0, ignore_token_id: int = -100, theta: float = 10000.0, compute_dtype: str = 'float16', layernorm_compute_type: str = 'float32', softmax_compute_type: str = 'float32', rotary_dtype: str = 'float32', param_init_type: str = 'float16', residual_dtype: str = None, embedding_init_type=None, qkv_has_bias: bool = False, qkv_concat: bool = False, attn_proj_has_bias: bool = False, parallel_config: Union[dict, mindformers.modules.transformer.transformer.TransformerOpParallelConfig] = <mindformers.modules.transformer.transformer.TransformerOpParallelConfig object>, moe_config: Union[dict, mindformers.modules.transformer.moe.MoEConfig] = <mindformers.modules.transformer.moe.MoEConfig object>, use_past: bool = False, extend_method: str = 'None', scaling_factor: float = 1.0, is_dynamic: bool = False, use_rope_slice: bool = False, use_flash_attention: bool = False, use_ring_attention: bool = False, use_attn_mask_compression: bool = False, use_eod_attn_mask_compression: bool = False, parallel_optimizer: bool = False, fine_grain_interleave: int = 1, pp_interleave_num: int = 1, offset: int = 0, init_method_std: float = 0.01, checkpoint_name_or_path: str = '', repetition_penalty: float = 1.0, max_decode_length: int = 1024, block_size: int = 16, num_blocks: int = 512, top_k: int = 5, top_p: float = 1.0, do_sample: bool = True, quant_config: dict = None, tie_word_embeddings: bool = False, llm_backend: str = '', fused_rms_norm: bool = True, input_sliced_sig: bool = False, rmsnorm_compute_2d: bool = False, chunk_prefill: bool = False, calculate_per_token_loss: bool = False, pipeline_stage: dict = None, return_hidden_states: bool = False, **kwargs)"
},
"mindformers.models.llama.llama_processor.LlamaProcessor": {
"signature": "(tokenizer=None, max_length=128, padding='max_length', return_tensors='ms')"