Mirror of https://gitee.com/mindspore/mindformers.git (synced 2025-12-06 11:29:59 +08:00)
!7782 [master][cleancode] Fix excessive code duplication
Merge pull request !7782 from zyw_hw/fix_huge_cc_1201
@@ -157,7 +157,6 @@ from mindformers.modules import (
    AlibiTensorV2,
    Dropout,
    EmbeddingOpParallelConfig,
    FeedForward,
    FixedSparseAttention,
    LayerNorm,
    Linear,

@@ -28,47 +28,50 @@ __all__ = ['AdamW', 'PmaAdamW', 'Muon']


@MindFormerRegister.register(MindFormerModuleType.OPTIMIZER)
class AdamW:
    r"""
    """
    This is the implementation of AdamW.

    .. math::
        \begin{array}{l}
            &\newline
            &\hline \\
            &\textbf{Parameters}: \: 1^{\text {st }}\text {moment vector} \: m , \: 2^{\text {nd}} \:
                \text{moment vector} \: v , \\
            &\: gradients \: g, \: \text{learning rate} \: \gamma,
                \text {exponential decay rates for the moment estimates} \: \beta_{1} \: \beta_{2} , \\
            &\:\text {parameter vector} \: w_{0}, \:\text{timestep} \: t, \: \text{weight decay} \: \lambda \\
            &\textbf{Init}: m_{0} \leftarrow 0, \: v_{0} \leftarrow 0, \: t \leftarrow 0, \:
                \text{init parameter vector} \: w_{0} \\[-1.ex]
            &\newline
            &\hline \\
            &\textbf{repeat} \\
            &\hspace{5mm} t \leftarrow t+1 \\
            &\hspace{5mm}\boldsymbol{g}_{t} \leftarrow \nabla f_{t}\left(\boldsymbol{w}_{t-1}\right) \\
            &\hspace{5mm}\boldsymbol{w}_{t} \leftarrow \boldsymbol{w}_{t-1}-\gamma\lambda\boldsymbol{w}_{t-1} \\
            &\hspace{5mm}\boldsymbol{m}_{t} \leftarrow \beta_{1} \boldsymbol{m}_{t-1}+\left(1-\beta_{1}\right)
                \boldsymbol{g}_{t} \\
            &\hspace{5mm}\boldsymbol{v}_{t} \leftarrow \beta_{2} \boldsymbol{v}_{t-1}+\left(1-\beta_{2}\right)
                \boldsymbol{g}_{t}^{2} \\
            &\hspace{5mm}\widehat{\boldsymbol{m}_{t}} \leftarrow \boldsymbol{m}_{t}/\big(1-\beta_{1}^{t} \big) \\
            &\hspace{5mm}\widehat{\boldsymbol{v}_{t}} \leftarrow \boldsymbol{v}_{t}/\big(1-\beta_{2}^{t} \big) \\
            &\hspace{5mm}\boldsymbol{w}_{t} \leftarrow \boldsymbol{w}_{t-1}-\gamma\widehat{\boldsymbol{m}_{t}}
                /\left(\sqrt{\widehat{\boldsymbol{v}_{t}}}+\epsilon\right) \\
            &\textbf{until}\text { stopping criterion is met } \\[-1.ex]
            &\newline
            &\hline \\[-1.ex]
            &\textbf{return} \: \boldsymbol{w}_{t} \\[-1.ex]
            &\newline
            &\hline \\[-1.ex]
        \end{array}

    :math:`m` represents the first moment vector moment1, :math:`v` represents the second moment vector moment2,
    :math:`\widehat{m}` represents the bias-corrected first moment vector, :math:`\widehat{v}` represents
    the bias-corrected second moment vector, :math:`g` represents gradients, :math:`\gamma` represents
    learning_rate, :math:`\beta_1`, :math:`\beta_2` represent beta1 and beta2, :math:`t` represents the current step,
    :math:`w` represents params, and :math:`\lambda` represents weight_decay.

    Args:
        params (Union[list[Parameter], list[dict]]): Must be list of `Parameter` or list of `dict`. When the
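Editor's note: the update rule in the docstring above can be written down directly. A minimal NumPy sketch of one AdamW step, assuming plain arrays (this is not the MindFormers implementation, just an illustration of the math):

# Illustrative sketch only: one decoupled-weight-decay Adam (AdamW) update.
import numpy as np

def adamw_step(w, m, v, grad, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01):
    w = w - lr * weight_decay * w                    # decoupled weight decay
    m = beta1 * m + (1 - beta1) * grad               # first moment estimate
    v = beta2 * v + (1 - beta2) * grad ** 2          # second moment estimate
    m_hat = m / (1 - beta1 ** t)                     # bias correction
    v_hat = v / (1 - beta2 ** t)
    w = w - lr * m_hat / (np.sqrt(v_hat) + eps)      # parameter update
    return w, m, v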
@@ -218,7 +221,7 @@ class AdamW:

@MindFormerRegister.register(MindFormerModuleType.OPTIMIZER)
class PmaAdamW:
    r"""
    """
    This is the implementation of PmaAdamW.

    Args:

@@ -13,7 +13,7 @@
# limitations under the License.
# ============================================================================
"""FusedPmaAdamW implementation"""
import mindspore.ops as ops
from mindspore import ops

from mindspore._checkparam import GT, INC_NEITHER
from mindspore import _checkparam as validator
@@ -76,7 +76,7 @@ def _check_param_value(fused_num, interleave_step, fused_algo, ema_alpha, prim_n


class FusedPmaAdamW(FusedAdamW):
    r"""
    """
    This is the implementation of PmaAdamW that uses fused operators.

    Args:

@@ -15,7 +15,6 @@
"""MindFormers Transformers API."""
from .transformer import (
    EmbeddingOpParallelConfig,
    FeedForward,
    LowerTriangularMaskWithDynamic,
    MoEConfig,
    OpParallelConfig,

@@ -49,6 +49,8 @@ from mindformers.tools.logger import logger
from mindformers.tools.utils import is_pynative
from mindformers.modules.activation import get_activation
from mindformers.modules.transformer.op_parallel_config import default_dpmp_config, OpParallelConfig, MoEParallelConfig
from mindformers.parallel_core.training_graph.base_models.common.embeddings.yarn_rotary_pos_embedding import \
    _yarn_find_correction_range

__all__ = [
    "FixedSparseAttention",
@@ -177,7 +179,6 @@ class _LayerInputCheck:
        Check the input shape's is equal to the expected shape, the value on 0-th is viewed as batch, and the
        batch size will not be checked.
        """
        target_shape = target_shape
        length, hidden = target_shape
        if isinstance(input_shape, tuple):
            input_shape = list(input_shape)
@@ -244,11 +245,9 @@ class Dropout(nn.Cell):
    """

    def __init__(self, keep_prob=0.5, dtype=mstype.float32):
        super(Dropout, self).__init__()
        super().__init__()
        if keep_prob <= 0 or keep_prob > 1:
            raise ValueError(
                "dropout probability should be a number in range (0, 1], but got {}".format(
                    keep_prob))
            raise ValueError(f"dropout probability should be a number in range (0, 1], but got {keep_prob}")
        Validator.check_subclass("dtype", dtype, mstype.number_type, self.cls_name)
        Validator.check_value_type('keep_prob', keep_prob, [float], self.cls_name)
        self.keep_prob = keep_prob
@@ -269,7 +268,7 @@ class Dropout(nn.Cell):
        return out

    def extend_repr(self):
        return 'keep_prob={}'.format(self.keep_prob)
        return f'keep_prob={self.keep_prob}'

    def shard(self, strategy):
        self.dropout.shard(strategy)
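Editor's note: the hunks above and below all apply the same three cleanups. A minimal before/after sketch of the recurring patterns (illustrative only, not repository code): Python 3 style super(), f-strings instead of str.format(), and dict literals instead of dict() calls.

# Illustrative sketch only: the cleanup patterns applied throughout this PR.
class Base:
    pass

class Before(Base):
    def __init__(self, keep_prob):
        super(Before, self).__init__()                     # Python 2 style super()
        if keep_prob <= 0 or keep_prob > 1:
            raise ValueError("keep_prob should be in (0, 1], but got {}".format(keep_prob))
        self.cfg = dict(keep_prob=keep_prob, layers=True)  # dict() call

class After(Base):
    def __init__(self, keep_prob):
        super().__init__()                                 # Python 3 style super()
        if keep_prob <= 0 or keep_prob > 1:
            raise ValueError(f"keep_prob should be in (0, 1], but got {keep_prob}")
        self.cfg = {"keep_prob": keep_prob, "layers": True}  # dict literal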
@@ -291,10 +290,10 @@ class LayerNorm(Cell):
    """

    def __init__(self, normalized_shape, eps=1e-5, param_init_type=mstype.float32, is_self_defined=False):
        super(LayerNorm, self).__init__()
        super().__init__()
        if param_init_type not in [mstype.float32, mstype.float16, mstype.bfloat16]:
            raise TypeError("The type of parameter 'param_init_type' should in [float32, float16], "
                            "but got the type : {}.".format(type(param_init_type)))
            raise TypeError(f"The type of parameter 'param_init_type' should in [float32, float16], "
                            f"but got the type : {type(param_init_type)}.")
        # Since the mindspore 1.10 version, the layernorm has been changed to P.LayerNorm
        self.is_self_defined = is_self_defined
        if not self.is_self_defined:
@@ -441,7 +440,7 @@ class Linear(Cell):
                 use_gmm=False,
                 param_init_type=mstype.float32,
                 compute_dtype=mstype.float16):
        super(Linear, self).__init__()
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        if not (isinstance(activation, str) or activation is None or issubclass(activation, nn.Cell)):
@@ -465,6 +464,7 @@ class Linear(Cell):
            self.weight = Parameter(initializer(weight_init, [self.expert_num] + weight_shape, param_init_type),
                                    name="weight")
            if self.use_gmm:
                # pylint: disable=import-outside-toplevel
                from mindspore.ops.auto_generate import GroupedMatmul
                # split_item only supports 0 and 3 now, 0 means the size of tensorlist not equal to 1,
                # 3 means the size of tensorlist is 1.
@@ -676,7 +676,7 @@ class FixedSparseAttention(nn.Cell):
                 seq_length=1024,
                 num_different_global_patterns=4,
                 parallel_config=default_dpmp_config):
        super(FixedSparseAttention, self).__init__()
        super().__init__()
        dp, mp = parallel_config.data_parallel, parallel_config.model_parallel
        if num_heads % mp != 0:
            raise ValueError(f"The number of heads {num_heads} must be a "
@@ -700,17 +700,17 @@ class FixedSparseAttention(nn.Cell):
        self.parallel_config = parallel_config
        size_per_head_list = [64, 128]
        if self.seq_length != 1024:
            raise ValueError("For 'FixedSparseAttention', the class variable 'seq_length' must be 1024, "
                             "but got the value : {}.".format(seq_length))
            raise ValueError(f"For 'FixedSparseAttention', the class variable 'seq_length' must be 1024, "
                             f"but got the value : {seq_length}.")
        if self.block_size != 64:
            raise ValueError("For 'FixedSparseAttention', the class variable 'block_size' must be 64, "
                             "but got the value : {}.".format(block_size))
            raise ValueError(f"For 'FixedSparseAttention', the class variable 'block_size' must be 64, "
                             f"but got the value : {block_size}.")
        if num_different_global_patterns != 4:
            raise ValueError("For 'FixedSparseAttention', the class variable 'num_different_global_patterns' "
                             "must be 4, but got the value : {}".format(num_different_global_patterns))
            raise ValueError(f"For 'FixedSparseAttention', the class variable 'num_different_global_patterns' "
                             f"must be 4, but got the value : {num_different_global_patterns}")
        if self.size_per_head not in size_per_head_list:
            raise ValueError("For 'FixedSparseAttention', the class variable 'size_per_head' only supports {}, "
                             "but got the value : {}.".format(size_per_head_list, self.size_per_head))
            raise ValueError(f"For 'FixedSparseAttention', the class variable 'size_per_head' "
                             f"only supports {size_per_head_list}, but got the value : {self.size_per_head}.")
        local_ones = np.ones((self.block_size, self.block_size),
                             dtype=np.float16)
        global_mask_original = np.ones((self.seq_length, self.global_size), dtype=np.float16)
@@ -851,7 +851,7 @@ class AlibiTensor(nn.Cell):
    """

    def __init__(self, seq_length, num_heads, parallel_config=default_dpmp_config):
        super(AlibiTensor, self).__init__()
        super().__init__()
        dp = parallel_config.data_parallel

        self.seq_length = seq_length
@@ -915,7 +915,7 @@ class AlibiTensorV2(nn.Cell):
    """

    def __init__(self, num_heads):
        super(AlibiTensorV2, self).__init__()
        super().__init__()
        self.num_heads = num_heads

        self.expand_2d = P.ExpandDims()
@@ -1124,22 +1124,6 @@ def _check_linear_scaling_factor(scaling_factor):
        raise ValueError(f"`scaling_factor`'s factor field must be a float >= 1, got {factor}")


def _yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    """Inverse dim formula to find dim based on number of rotations"""
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))


def _yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
    """Find dim range bounds based on rotations"""
    low = math.floor(
        _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        _yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def _yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0

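Editor's note: the YaRN correction-range helpers removed here now come from yarn_rotary_pos_embedding. A small standalone usage sketch of the same formula (illustrative copy, not an import from mindformers; values are arbitrary):

# Illustrative sketch only: which rotary dims fall between two rotation counts.
import math

def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    # inverse dim formula: dim index at which the given number of rotations occurs
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # clamp to valid dim indices

if __name__ == "__main__":
    # e.g. a 128-dim rotary embedding with the default base and context length
    print(find_correction_range(low_rot=32, high_rot=1, dim=128))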
@@ -221,14 +221,14 @@ class Quantizer(ABC):

    @abstractmethod
    def _process_model_after_weight_loading(self, model, **kwargs):
        pass
        return model

    @property
    @abstractmethod
    def is_serializable(self):
        pass
        return False

    @property
    @abstractmethod
    def is_trainable(self):
        pass
        return False

@@ -52,19 +52,9 @@ class PtqQuantizer(Quantizer):
    def _process_model_before_weight_loading(
            self, model: "PreTrainedModel", **kwargs
    ):
        # pylint: disable=import-outside-toplevel
        from mindspore_gs.ptq import PTQ
        ptq = PTQ(config=self.quant_config, layer_policies=self.layer_policies)
        model = ptq.apply(model)
        model = ptq.convert(model)
        return model

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    @property
    def is_serializable(self):
        return False

    @property
    def is_trainable(self):
        return False

@@ -65,14 +65,3 @@ class RtnQuantizer(Quantizer):
        model = ptq.apply(model)
        model = ptq.convert(model)
        return model

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    @property
    def is_serializable(self):
        return False

    @property
    def is_trainable(self):
        return False

@@ -21,7 +21,6 @@ This is an experimental interface that is subject to change or deletion.

from .transformer import (
    EmbeddingOpParallelConfig,
    FeedForward,
    LowerTriangularMaskWithDynamic,
    TransformerOpParallelConfig,
    TransformerRecomputeConfig,

@@ -26,7 +26,6 @@ import mindspore as ms
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import Zero
from mindspore import nn
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
from mindspore.ops import functional as F
@@ -40,18 +39,14 @@ except ImportError:
from mindspore import log as logger
from mindspore.parallel._utils import _get_parallel_mode, _is_sharding_propagation
from mindspore.context import ParallelMode
from mindformers.modules.layers import Linear, _args_type_validator_check, _valid_type_checks, _valid_value_checks, \
    _check_input_dtype
from mindformers.modules.transformer.op_parallel_config import default_dpmp_config, _PipeLineConfig, OpParallelConfig, \
    _Config, _check_config, MoEParallelConfig
from mindformers.version_control import get_dropout
from mindformers.modules.transformer.op_parallel_config import _PipeLineConfig, OpParallelConfig, \
    _Config, MoEParallelConfig

from mindformers.tools.logger import _LogActionOnce
from mindformers.tools.utils import is_pynative

__all__ = [
    "LowerTriangularMaskWithDynamic",
    "FeedForward",
    "TransformerOpParallelConfig",
    "EmbeddingOpParallelConfig",
    "TransformerRecomputeConfig",
@@ -211,7 +206,7 @@ class TransformerSwapConfig(_Config):
        if isinstance(layer_swap, dict):
            layer_swap = [layer_swap]
        if self._validate_layers_consistency(layer_swap):
            return [dict(backward_prefetch=layer_swap[0][self.backward_prefetch], layers=True)]
            return [{"backward_prefetch": layer_swap[0][self.backward_prefetch], "layers": True}]
        return layer_swap

    def _initialize_op_swap(self, op_swap):
@@ -225,7 +220,7 @@ class TransformerSwapConfig(_Config):
        op_swap_dict = self.op_swap_to_dict(op_swap)
        for k, v in op_swap_dict.items():
            if self._validate_layers_consistency(v, mode=f'op_swap: {k}'):
                op_swap_dict[k] = [dict(backward_prefetch=v[0][self.backward_prefetch], layers=True)]
                op_swap_dict[k] = [{"backward_prefetch": v[0][self.backward_prefetch], "layers": True}]
        return op_swap_dict

    def _validate_layers_consistency(self, layer_swap, mode='layer_swap'):
@@ -283,17 +278,17 @@ class TransformerSwapConfig(_Config):
        """Adds an operation swap configuration to the dictionary."""
        if key in dic:
            dic[key].append(
                dict(
                    layers=item.get(self.layers),
                    backward_prefetch=item.get(self.backward_prefetch)
                )
                {
                    'layers': item.get(self.layers),
                    'backward_prefetch': item.get(self.backward_prefetch)
                }
            )
        else:
            dic[key] = [
                dict(
                    layers=item.get(self.layers),
                    backward_prefetch=item.get(self.backward_prefetch)
                )
                {
                    'layers': item.get(self.layers),
                    'backward_prefetch': item.get(self.backward_prefetch)
                }
            ]
        return dic
@@ -507,9 +502,9 @@ class ContextParallelAlgo(Enum):
    Args:
        Enum (str): chooses context parallel type
    """
    colossalai_cp = "colossalai_cp"
    ulysses_cp = "ulysses_cp"
    hybrid_cp = "hybrid_cp"
    COLOSSALAI_CP = "colossalai_cp"
    ULYSSES_CP = "ulysses_cp"
    HYBRID_CP = "hybrid_cp"


default_transformer_swap_config = TransformerSwapConfig()
@@ -601,7 +596,7 @@ class TransformerOpParallelConfig(_Config):
            ValueError: in hybrid_cp algorithm, context_parallel should be divisible by ulysses_degree_in_cp
        """
        if self.context_parallel == 1:
            if self.context_parallel_algo != ContextParallelAlgo.colossalai_cp:
            if self.context_parallel_algo != ContextParallelAlgo.COLOSSALAI_CP:
                logger.warning(f"context_parallel_algo {self.context_parallel_algo.value} will not take effect "
                               "when context_parallel == 1.")
            if self.ulysses_degree_in_cp > 1:
@@ -610,10 +605,10 @@ class TransformerOpParallelConfig(_Config):
            return

        # here context parallel > 1
        if self.context_parallel_algo != ContextParallelAlgo.hybrid_cp and self.ulysses_degree_in_cp > 1:
        if self.context_parallel_algo != ContextParallelAlgo.HYBRID_CP and self.ulysses_degree_in_cp > 1:
            logger.warning(f"ulysses_degree_in_cp {self.ulysses_degree_in_cp} will not take effect when "
                           f"context_parallel_algo {self.context_parallel_algo.value} is not `hybrid_cp`.")
        if (self.context_parallel_algo == ContextParallelAlgo.hybrid_cp and
        if (self.context_parallel_algo == ContextParallelAlgo.HYBRID_CP and
                self.context_parallel % self.ulysses_degree_in_cp != 0):
            raise ValueError(f"When using hybrid_cp algorithm, context_parallel {self.context_parallel} "
                             f"should be divisible by ulysses_degree_in_cp {self.ulysses_degree_in_cp}. "
@@ -627,9 +622,9 @@ class TransformerOpParallelConfig(_Config):
        """
        if self.context_parallel == 1:
            return 1
        if self.context_parallel_algo == ContextParallelAlgo.colossalai_cp:
        if self.context_parallel_algo == ContextParallelAlgo.COLOSSALAI_CP:
            return 1
        if self.context_parallel_algo == ContextParallelAlgo.ulysses_cp:
        if self.context_parallel_algo == ContextParallelAlgo.ULYSSES_CP:
            return self.context_parallel
        # hybird
        return self.ulysses_degree_in_cp
@@ -786,260 +781,6 @@ class TransformerOpParallelConfig(_Config):
default_transformer_config = TransformerOpParallelConfig()


class FeedForward(Cell):
    r"""
    The multilayer perceptron with two linear layers with dropout applied at final output. The first linear
    will project the input dimension from hidden_size to ffn_hidden_size. The second linear will project the
    dimension from ffn_hidden_size to hidden_size. The first linear is sharded on the relative dimension,
    and the second linear is sharded on the output dimension. The overview process can be:

    .. math::
        Dropout((xW_1+b_1)W_2 + b_2)

    where the :math:`W_1, W_2, b_1` and :math:`b_2` are trainable parameters.

    Args:
        hidden_size (int): The dimension of the inputs.
        ffn_hidden_size (int): The intermediate hidden size.
        dropout_rate (float): The dropout rate for the second linear's output.
        hidden_act (str, nn.Cell): The activation of the internal feedforward layer. Supports 'relu',
            'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
            'hsigmoid', 'logsigmoid' and so on. User can provide custom activation to the argument.
            If user wants to run the net in the parallel mode, the custom activation must also provide
            the `activation_shard` function. Please see examples. Default: gelu.
        expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used
            and the first dimension in BatchMatMul indicate expert_num. Default: 1.
        expert_group_size (int): The number of tokens in each data parallel group. Default: None. This parameter is
            effective only when in AUTO_PARALLEL mode, and NOT SHARDING_PROPAGATION.
        param_init_type (dtype.Number): The parameter initialization type. Should be mstype.float32 or
            mstype.float16. Default: mstype.float32.
        parallel_config (OpParallelConfig, MoEParallelConfig): The config of parallel setting, see
            `OpParallelConfig` or `MoEParallelConfig`. When MoE is applied, MoEParallelConfig is effective,
            otherwise OpParallelConfig is effective. Default `default_dpmp_config`,
            an instance of `OpParallelConfig` with default args.

    Inputs:
        - **x** (Tensor) - should be `[batch, seq_length, hidden_size] or [batch * seq_length, hidden_size]`.
          Float tensor.

    Outputs:
        Tensor, the output of this layer after mapping. The shape is `[batch, seq_length, hidden_size] or
        [batch * seq_length, hidden_size]`.

    Raises:
        TypeError: `hidden_act` is not a string or nn.Cell.
        TypeError: `parallel_config` is not a subclass of OpParallelConfig.
        ValueError: `ffn_hidden_size` is not a multiple of the model parallel way.
        ValueError: `hidden_size` is not a multiple of the model parallel way.

    Supported Platforms:
        ``Ascend`` ``GPU``

    Examples:
        >>> import numpy as np
        >>> from mindformers.modules.transformer import FeedForward
        >>> from mindspore import dtype as mstype
        >>> from mindspore import Tensor, nn
        >>> import mindspore.ops as ops
        >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
        >>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
        >>> output = model(tensor)
        >>> print(output.shape)
        (2, 20, 15)
        >>> # Example 2 using custom hidden activation
        >>> class MyActivationNoShard(nn.Cell):
        ...     def __init__(self):
        ...         super(MyActivationNoShard, self).__init__()
        ...         self.add = ops.Add()
        ...     def construct(self, x):
        ...         return self.add(x, 0.1)
        >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1,
        ...                     hidden_act=MyActivationNoShard)
        >>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
        >>> output = model(tensor)
        >>> print(output.shape)
        (2, 20, 15)
        >>> # Example 3 using custom hidden activation with activation_shard
        >>> # If user wants to run on the SEMI/AUTO parallel mode, the custom activation must provide
        >>> # a class function named activation_shard. It accepts the argument parallel_config (OpParallelConfig,
        >>> # MoEParallelConfig) and set the shard for the primitives used in the construct.
        >>> class MyActivationWithShard(nn.Cell):
        ...     def __init__(self):
        ...         super(MyActivationWithShard, self).__init__()
        ...         self.add = ops.Add()
        ...     def construct(self, x):
        ...         return self.add(x, 0.1)
        ...     def activation_shard(self, parallel_config):
        ...         self.add.shard(((parallel_config.data_parallel, parallel_config.model_parallel), ()))
        >>>
        >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1,
        ...                     hidden_act=MyActivationWithShard)
        >>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
        >>> output = model(tensor)
        >>> print(output.shape)
        (2, 20, 15)
    """

    @_LogActionOnce(m_logger=logger, key='FeedForward',
                    no_warning=_get_parallel_mode() in (ParallelMode.STAND_ALONE,))
    @_args_type_validator_check(hidden_size=Validator.check_positive_int,
                                ffn_hidden_size=Validator.check_positive_int,
                                dropout_rate=Validator.check_non_negative_float,
                                param_init_type=_valid_value_checks([mstype.float32, mstype.bfloat16, mstype.float16],
                                                                    "FeedForward"),
                                compute_dtype=_valid_value_checks([mstype.float32, mstype.bfloat16, mstype.float16],
                                                                  "FeedForward"),
                                parallel_config=_valid_type_checks([OpParallelConfig, MoEParallelConfig],
                                                                   "FeedForward"))
    def __init__(self, hidden_size,
                 ffn_hidden_size,
                 dropout_rate,
                 hidden_act='gelu',
                 expert_num=1,
                 expert_group_size=None,
                 param_init_type=mstype.float32,
                 parallel_config=default_dpmp_config,
                 compute_dtype=mstype.float16):
        super(FeedForward, self).__init__()
        self.dtype = compute_dtype
        if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)):
            raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, "
                            f"but got {hidden_act}.")
        if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,):
            _check_config(parallel_config)
            mp = parallel_config.model_parallel
            if expert_num > 1:
                ep = parallel_config.expert_parallel
            else:
                ep = 1
            # ffn use less dp than other ops when use_moe, due to there are ops use dp and ep.
            dp = parallel_config.data_parallel // ep
            if ffn_hidden_size % mp != 0:
                raise ValueError("For 'FeedForward', the class variable 'ffn_hidden_size' must be a multiple of the"
                                 "num of model parallel, but got the ffn_hidden_size is {} and the num of model "
                                 "parallel is {}.".format(ffn_hidden_size, mp))
            if hidden_size % mp != 0:
                raise ValueError("For 'FeedForward', the class variable 'hidden_size' must be a multiple of the num of "
                                 "model parallel, but got the hidden_size is {} and the num of model parallel is {}."
                                 .format(hidden_size, mp))
            if dropout_rate < 0 or dropout_rate >= 1:
                raise ValueError("For 'FeedForward', the class variable 'dropout_rate' must be in the range [0, 1.0), "
                                 "but got the value : {}.".format(dropout_rate))
            input_size = hidden_size
            output_size = ffn_hidden_size

            # Project to ffn_hidden_size
            self.mapping = Linear(in_channels=input_size,
                                  out_channels=output_size,
                                  activation=hidden_act,
                                  transpose_b=False,
                                  expert_num=expert_num,
                                  expert_group_size=expert_group_size,
                                  outer_batch=dp,
                                  param_init_type=param_init_type,
                                  compute_dtype=compute_dtype)

            # Project back to hidden_size
            self.projection = Linear(in_channels=output_size,
                                     out_channels=input_size,
                                     transpose_b=False,
                                     expert_num=expert_num,
                                     expert_group_size=expert_group_size,
                                     outer_batch=dp,
                                     param_init_type=param_init_type,
                                     compute_dtype=compute_dtype)
            if expert_num > 1:
                self.projection.shard(strategy_matmul=((dp, ep, 1, mp), (ep, mp, 1)))
            else:
                self.projection.shard(strategy_matmul=((dp, mp), (mp, 1)))
            self.projection.bias.parallel_optimizer = False
            self.dropout = get_dropout(dropout_rate)
            self.dropout_3d = get_dropout(dropout_rate)
            self.dropout_4d = get_dropout(dropout_rate)
            self.cast = P.Cast()
        else:
            _check_config(parallel_config)
            mp = parallel_config.model_parallel
            if expert_num > 1:
                ep = parallel_config.expert_parallel
            else:
                ep = 1
            # ffn use less dp than other ops when use_moe, due to there are ops use dp and ep.
            dp = parallel_config.data_parallel // ep
            if ffn_hidden_size % mp != 0:
                raise ValueError("For 'FeedForward', the class variable 'ffn_hidden_size' must be a multiple of the"
                                 "num of model parallel, but got the ffn_hidden_size is {} and the num of model "
                                 "parallel is {}.".format(ffn_hidden_size, mp))
            if hidden_size % mp != 0:
                raise ValueError("For 'FeedForward', the class variable 'hidden_size' must be a multiple of the num of "
                                 "model parallel, but got the hidden_size is {} and the num of model parallel is {}."
                                 .format(hidden_size, mp))
            if dropout_rate < 0 or dropout_rate >= 1:
                raise ValueError("For 'FeedForward', the class variable 'dropout_rate' must be in the range [0, 1.0), "
                                 "but got the value : {}.".format(dropout_rate))
            input_size = hidden_size
            output_size = ffn_hidden_size

            # Project to ffn_hidden_size
            self.mapping = Linear(in_channels=input_size,
                                  out_channels=output_size,
                                  activation=hidden_act,
                                  transpose_b=False,
                                  expert_num=expert_num,
                                  expert_group_size=expert_group_size,
                                  outer_batch=dp,
                                  param_init_type=param_init_type,
                                  compute_dtype=compute_dtype)

            if expert_num > 1:
                self.mapping.shard(strategy_matmul=((dp, ep, 1, 1), (ep, 1, mp)),
                                   strategy_bias=((dp, ep, 1, mp), (1, ep, 1, mp)),
                                   strategy_activation=((dp, ep, 1, mp),))
            else:
                self.mapping.shard(strategy_matmul=((dp, 1), (1, mp)),
                                   strategy_bias=((dp, mp), (mp,)),
                                   strategy_activation=((dp, mp),))
            # Project back to hidden_size
            self.projection = Linear(in_channels=output_size,
                                     out_channels=input_size,
                                     transpose_b=False,
                                     expert_num=expert_num,
                                     expert_group_size=expert_group_size,
                                     outer_batch=dp,
                                     param_init_type=param_init_type,
                                     compute_dtype=compute_dtype)
            if expert_num > 1:
                self.projection.shard(strategy_matmul=((dp, ep, 1, mp), (ep, mp, 1)),
                                      strategy_bias=((dp, ep, 1, 1), (1, ep, 1, 1)))
            else:
                self.projection.shard(strategy_matmul=((dp, mp), (mp, 1)),
                                      strategy_bias=((dp, 1), (1,)))
            self.projection.bias.parallel_optimizer = False
            self.dropout = get_dropout(dropout_rate)
            self.dropout_3d = get_dropout(dropout_rate)
            self.dropout_4d = get_dropout(dropout_rate)
            self.dropout.dropout.shard(((dp, 1),))
            self.dropout_3d.dropout.shard(((dp, 1, 1),))
            self.dropout_4d.dropout.shard(((dp, ep, 1, 1),))
            self.cast = P.Cast()

    def construct(self, x):
        """Forward process of the FeedForward"""
        _check_input_dtype(F.dtype(x), "x", [mstype.float32, mstype.float16, mstype.bfloat16], self.cls_name)
        x = self.cast(x, self.dtype)
        # returned shape is [bs, seq_length, ffn_hidden_size] or [bs * seq_length, ffn_hidden_size]
        hidden = self.mapping(x)
        output = self.projection(hidden)
        # returned shape is [bs, seq_length, ffn_hidden_size] or [bs * seq_length, ffn_hidden_size]
        if len(F.shape(output)) == 3:
            output = self.dropout_3d(output)
        elif len(F.shape(output)) == 2:
            output = self.dropout(output)
        else:
            output = self.dropout_4d(output)
        return output


class LowerTriangularMaskWithDynamic(Cell):
    r"""
    Get the Strictly Lower triangular matrix from the input_ids.

@@ -84,6 +84,7 @@ class UnquantizedGroupedLinearMethod(GroupedLinearMethodBase):
        self.cast = P.Cast()
        self.matmul = ops.auto_generate.GroupedMatmulV4()

    # pylint: disable=W0237
    def create_weights(self, layer: nn.Cell, num_local_experts: int,
                       input_size_per_partition: int, output_partition_sizes: list[int],
                       params_dtype, **extra_weight_attrs):
@@ -216,17 +217,17 @@ class ColumnParallelGroupedLinear(GroupedLinearBase):
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""
                 ):
        super(ColumnParallelGroupedLinear, self).__init__(num_local_experts,
                                                          input_size,
                                                          output_size,
                                                          skip_bias_add,
                                                          config.params_dtype,
                                                          quant_config=quant_config,
                                                          prefix=prefix)
        super().__init__(num_local_experts,
                         input_size,
                         output_size,
                         skip_bias_add,
                         config.params_dtype,
                         quant_config=quant_config,
                         prefix=prefix)
        if stride > 1:
            raise NotImplementedError(
                "For ColumnParallelGroupedLinear, `stride > 1` is not supported for now, "
                "but got `stride={}`".format(stride))
                f"For ColumnParallelGroupedLinear, `stride > 1` is not supported for now, "
                f"but got `stride={stride}`")
        if skip_bias_add:
            raise NotImplementedError(
                "For ColumnParallelGroupedLinear, `skip_bias_add=True` is not supported for now."
@@ -275,6 +276,7 @@ class ColumnParallelGroupedLinear(GroupedLinearBase):
        else:
            self.bias = None

    # pylint: disable=W0237
    def construct(self, input_parallel, weight=None, group_list=None):
        """Forward of ColumnParallelGroupedLinear."""
        if weight is None:
@@ -386,15 +388,15 @@ class ColumnParallelGroupedLinear(GroupedLinearBase):


class RowParallelGroupedLinear(GroupedLinearBase):
    r"""
    """
    The group linear layer with weight sliced on first dimension by tensor parallel size.
    This layer implements the operation as:

    .. math::
        \text{outputs} = \text{inputs} * \text{weight} + \text{bias},

    where :math:`inputs` is the input tensors, :math:`\text{weight}` is a weight matrix created by the layer,
    and :math:`\text{bias}` is a bias vector created by the layer (only if has_bias is True).

    Args:
        num_local_experts (int): The number of local expert.
@@ -416,11 +418,11 @@ class RowParallelGroupedLinear(GroupedLinearBase):
        prefix (str): The prefix string for this linear layer. Default: empty string("").

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(*, in\_channels)`. The `input_size` in `Args` should be equal
          to :math:`in\_channels` in `Inputs`.

    Outputs:
        Tensor of shape :math:`(*, out\_channels)`.

    Supported Platforms:
        ``Ascend``
@@ -445,17 +447,17 @@ class RowParallelGroupedLinear(GroupedLinearBase):
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""
                 ):
        super(RowParallelGroupedLinear, self).__init__(num_local_experts,
                                                       input_size,
                                                       output_size,
                                                       skip_bias_add,
                                                       config.params_dtype,
                                                       quant_config=quant_config,
                                                       prefix=prefix)
        super().__init__(num_local_experts,
                         input_size,
                         output_size,
                         skip_bias_add,
                         config.params_dtype,
                         quant_config=quant_config,
                         prefix=prefix)
        if stride > 1:
            raise NotImplementedError(
                "For RowParallelGroupedLinear, `stride > 1` is not supported for now, "
                "but got `stride={}`".format(stride))
                f"For RowParallelGroupedLinear, `stride > 1` is not supported for now, "
                f"but got `stride={stride}`")
        if not is_expert:
            raise NotImplementedError(
                "For RowParallelGroupedLinear, `is_expert=False` is not supported for now.")
@@ -502,6 +504,7 @@ class RowParallelGroupedLinear(GroupedLinearBase):
        else:
            self.bias = None

    # pylint: disable=W0237
    def construct(self, input_, weight=None, group_list=None):
        """Forward of RowParallelGroupedLinear."""
        if weight is None:

@@ -20,12 +20,15 @@ __all__ = [
    "update_comm_config",
]

import os
import stat
from contextlib import contextmanager
import numpy as np

import mindspore as ms
from mindspore import Tensor, ops, Parameter, mint
from mindspore.communication import get_group_size
from mindspore.train.node_strategy_pb2 import ParallelStrategyMap as ckpt_strategy

from mindformers.version_control import is_310p
from mindformers.parallel_core.transformer_config import TransformerConfig
@@ -65,7 +68,7 @@ ATTNMASK_FUNC_MAP = {


def get_attn_mask_func(mask_func_type):
    r"""
    """
    Get attention mask function.

    Args:
@@ -75,9 +78,9 @@ def get_attn_mask_func(mask_func_type):
        Function, the attention mask function.
    """
    if mask_func_type not in ATTNMASK_FUNC_MAP:
        raise KeyError("Invalid attention mask function. Supported attention "
                       "mask function are ['attn_mask_fill', 'attn_mask_add'] "
                       ", but got {}.".format(mask_func_type))
        raise KeyError(f"Invalid attention mask function. Supported attention "
                       f"mask function are ['attn_mask_fill', 'attn_mask_add'] "
                       f", but got {mask_func_type}.")
    return ATTNMASK_FUNC_MAP[mask_func_type]

@@ -158,7 +161,7 @@ def create_empty_parameter(shape, *, dtype=None, device=None, **kwargs):
def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    if numerator % denominator != 0:
        raise ValueError("{} is not divisible by {}".format(numerator, denominator))
        raise ValueError(f"{numerator} is not divisible by {denominator}")


def divide(numerator, denominator):
@@ -178,10 +181,6 @@ def save_strategy_file(state_dict, strategy_file_name):
    Supported Platforms:
        ``Ascend``
    """
    import os
    import stat
    from mindspore.train.node_strategy_pb2 import ParallelStrategyMap as ckpt_strategy

    stra = ckpt_strategy()

    stage_rank_size = state_dict["stage_rank_size"]
@@ -361,12 +360,13 @@ def get_num_layers_and_offset(config):
        return int(layer_list[pp_rank]), int(sum(layer_list[:pp_rank]))
    return num_layers, 0


def use_ms_custom_ops():
    """
    Determine whether has custom ops
    """
    try:
        # pylint: disable=W0611
        # pylint: disable=W0611, C0415
        import ms_custom_ops
    except ModuleNotFoundError:
        # environment need install ms_custom_ops package

@@ -29,7 +29,8 @@ from mindspore.parallel._utils import _get_parallel_mode, _is_sharding_propagati
from mindspore import ops

from mindformers.parallel_core.training_graph.loss_func import CrossEntropyLoss
from mindformers.parallel_core.training_graph.transformer.multi_token_prediction import MultiTokenPredictionBlock
from mindformers.parallel_core.training_graph.transformer.multi_token_prediction import MultiTokenPredictionBlock, \
    func_infer_dtype, func_infer_shape, func_infer_shape_labels_and_masks
from mindformers.parallel_core.training_graph.device_matrix import layout
from mindformers.parallel_core.utils.spec_utils import ModuleSpec
from mindformers.parallel_core.training_graph.transformer.mask_generate import CausalMaskGenerate
@@ -56,26 +57,6 @@ from mindformers.version_control import get_lazy_inline as lazy_inline
from mindformers.core.optim.muon_utils import make_muon_fns


def func_infer_dtype(*args):
    """infer_dtype for Morph."""
    return args[0]


def func_infer_shape(*args):
    """infer_shape for Morph."""
    input_shape = args[0]
    shape_value = np.prod(input_shape[:-1])
    output_shape = [int(shape_value), args[0][-1]]
    return output_shape

def func_infer_shape_labels_and_masks(*args):
    """infer_shape for Morph."""
    input_shape = args[0]
    shape_value = np.prod(input_shape)
    output_shape = [int(shape_value)]
    return output_shape


class PreprocessLabelsAndMasks(nn.Cell):
    """Preprocess input_ids and generate labels and masks.
    """

@@ -40,26 +40,23 @@ _device_local_loss = {}

def get_device_local_loss(tag="lm"):
    """Get `_device_local_loss` Parameter after init"""
    global _device_local_loss
    if tag is None:
        return _device_local_loss
    if _device_local_loss.get(tag, None) is None:
        _device_local_loss[tag] = Parameter(
            Tensor([0.0], mstype.float32), name=f"_device_local_loss", requires_grad=False
            Tensor([0.0], mstype.float32), name="_device_local_loss", requires_grad=False
        )
    return _device_local_loss[tag]


def reset_device_local_loss():
    """Reset `_device_local_loss` parameter to zero"""
    global _device_local_loss
    for _, loss in _device_local_loss.items():
        F.assign(loss, Tensor([0.0], mstype.float32))


def check_device_local_loss():
    """check if Nan or Inf in `_device_local_loss` parameter then terminate training"""
    global _device_local_loss
    if not _device_local_loss:
        return
    for tag, device_local_loss in _device_local_loss.items():
@@ -88,7 +85,7 @@ class _LogSoftmax(nn.Cell):
        The corresponding log softmax results.
    """
    def __init__(self, config: TransformerConfig = default_transformer_config):
        super(_LogSoftmax, self).__init__()
        super().__init__()
        dp = config.data_parallel_size
        mp = config.tensor_model_parallel_size
        cp = config.context_parallel_size
@@ -143,7 +140,7 @@ class _NLLLoss(nn.Cell):
        The corresponding loss results.
    """
    def __init__(self, config: TransformerConfig = default_transformer_config):
        super(_NLLLoss, self).__init__()
        super().__init__()
        dp = config.data_parallel_size
        mp = config.tensor_model_parallel_size
        cp = config.context_parallel_size
@@ -176,7 +173,7 @@ class _NLLLoss(nn.Cell):


class CrossEntropyLoss(nn.Cell):
    r"""
    """
    Calculate the cross entropy loss.

    CrossEntropyLoss supports two different types of targets:
@@ -185,9 +182,9 @@ class CrossEntropyLoss(nn.Cell):
      When reduction is set to 'none', the cross-entropy loss is computed as follows:

      .. math::
          \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
          l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
          \cdot \mathbb{1}\{y_n \not= \text{ignore_index}\}

      where :math:`x` denotes the predicted values, :math:`t` denotes the target values, :math:`w` denotes the weights,
      and :math:`N` is the batch size. The index :math:`c` ranges from [0, C-1], representing the class indices,
@@ -196,19 +193,19 @@ class CrossEntropyLoss(nn.Cell):
      If reduction is not set to 'none' (the default is 'mean'), the loss is computed as:

      .. math::
          \ell(x, y) = \begin{cases}
              \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore_index}\}} l_n, &
              \text{if reduction} = \text{'mean',}\\
              \sum_{n=1}^N l_n, &
              \text{if reduction} = \text{'sum'.}
          \end{cases}

    - Class probabilities (float), used when the target is a probability distribution over multiple class labels.
      When reduction is set to 'none', the cross-entropy loss is computed as follows:

      .. math::
          \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
          l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}

      where :math:`x` denotes the predicted values, :math:`t` denotes the target values, :math:`w` denotes the weights,
      and :math:`N` is the batch size. The index :math:`c` ranges from [0, C-1], representing the class indices,
@@ -217,12 +214,12 @@ class CrossEntropyLoss(nn.Cell):
      If reduction is not set to 'none' (the default is 'mean'), the loss is computed as:

      .. math::
          \ell(x, y) = \begin{cases}
              \frac{\sum_{n=1}^N l_n}{N}, &
              \text{if reduction} = \text{'mean',}\\
              \sum_{n=1}^N l_n, &
              \text{if reduction} = \text{'sum'.}
          \end{cases}

    Args:
        config (TransformerConfig): The parallel configuration. Default: default_transformer_config,
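Editor's note: the class-index, 'mean'-reduction formula in the docstring above can be checked with a few lines of NumPy. A minimal sketch, assuming dense logits and integer targets (this is not the parallel MindFormers implementation):

# Illustrative sketch only: weighted cross entropy with ignore_index, 'mean' reduction.
import numpy as np

def cross_entropy_mean(logits, targets, weights=None, ignore_index=-100):
    n, c = logits.shape
    if weights is None:
        weights = np.ones(c)
    # numerically stable log-softmax
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    mask = targets != ignore_index
    safe_targets = np.where(mask, targets, 0)
    w = weights[safe_targets] * mask                      # w_{y_n} * 1{y_n != ignore_index}
    losses = -w * log_probs[np.arange(n), safe_targets]   # l_n
    return losses.sum() / w.sum()                         # 'mean' reduction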
@@ -258,7 +255,7 @@ class CrossEntropyLoss(nn.Cell):
    @_LogActionOnce(m_logger=logger, key='CrossEntropyLoss',
                    no_warning=_get_parallel_mode() in (ParallelMode.STAND_ALONE,))
    def __init__(self, config: TransformerConfig = default_transformer_config, loss_tag='lm', **kwargs):
        super(CrossEntropyLoss, self).__init__()
        super().__init__()
        dp = config.data_parallel_size
        mp = config.tensor_model_parallel_size
        cp = config.context_parallel_size
@@ -347,7 +344,7 @@ class VocabParallelCrossEntropy(nn.Cell):
    """calculate cross entropy loss"""

    def __init__(self, config: TransformerConfig = default_transformer_config, **kwargs):
        super(VocabParallelCrossEntropy, self).__init__()
        super().__init__()
        self.cross_entropy = CrossEntropyLoss(config, **kwargs)

    def construct(self, vocab_parallel_logits, target, input_mask=None, label_smoothing=None):

@@ -36,7 +36,7 @@ def get_strategy(config: TransformerConfig):


class LayerNorm(nn.Cell):
    r"""
    """
    Layer norm operation.

    Args:
@@ -52,7 +52,7 @@ class LayerNorm(nn.Cell):
    """

    def __init__(self, config, dim, eps=1e-5):
        super(LayerNorm, self).__init__()
        super().__init__()
        self.params_dtype = config.params_dtype
        self.compute_type = config.layernorm_compute_dtype

@@ -117,7 +117,7 @@ class LayerNorm(nn.Cell):


class FusedLayerNorm(nn.Cell):
    r"""
    """
    Layer norm operation.

    Args:
@@ -133,7 +133,7 @@ class FusedLayerNorm(nn.Cell):
    """

    def __init__(self, config, dim, eps=1e-5):
        super(FusedLayerNorm, self).__init__()
        super().__init__()
        self.params_dtype = config.params_dtype
        self.compute_type = config.layernorm_compute_dtype

@@ -170,8 +170,7 @@ class FusedLayerNorm(nn.Cell):
            strategy = (cp, dp, 1)

        if strategy[-1] != 1:
            raise TypeError(
                'The last dim in FusedLayerNorm can not equal to 1! Strategy {} not supported!'.format(strategy))
            raise TypeError(f'The last dim in FusedLayerNorm can not equal to 1! Strategy {strategy} not supported!')

        self.layer_norm.shard((strategy, (strategy[-1],), (strategy[-1],)))

@@ -180,7 +179,7 @@ class FusedLayerNorm(nn.Cell):


class RMSNorm(nn.Cell):
    r"""
    """
    A self-defined RMSNorm operation using reduce mean.

    Args:
@@ -196,7 +195,7 @@ class RMSNorm(nn.Cell):
    """

    def __init__(self, config, dim, eps=1e-6):
        super(RMSNorm, self).__init__()
        super().__init__()
        self.params_dtype = config.params_dtype
        self.compute_type = config.layernorm_compute_dtype

@@ -251,7 +250,7 @@ class RMSNorm(nn.Cell):


class FusedRMSNorm(nn.Cell):
    r"""
    """
    FusedRMSNorm operation

    Args:
@@ -267,7 +266,7 @@ class FusedRMSNorm(nn.Cell):
    """

    def __init__(self, config, dim, eps=1e-6):
        super(FusedRMSNorm, self).__init__()
        super().__init__()
        self.params_dtype = config.params_dtype
        self.compute_type = config.layernorm_compute_dtype

@@ -77,7 +77,7 @@ ATTNMASK_FUNC_MAP = {


def get_attn_mask_func(mask_func_type):
    r"""
    """
    Get attention mask function.

    Args:

@@ -207,7 +207,7 @@ class MFTrainOneStepCell(nn.TrainOneStepWithLossScaleCell):
        **kwargs (Any): Additional parameters.

    Inputs:
        - **\*inputs** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`.
        - **\\*inputs** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \\ldots)`.

    Outputs:
        Tuple of 5 or 7 Tensor, the loss, overflow flag, current loss scale value, learning rate,

@@ -101,7 +101,7 @@ def is_not_compatibility(base_str, new_str):
def set_failure_list(api_str, value, signature, failure_list):
    """set failure info list"""
    failure_list.append(f"# {api_str}:")
    failure_list.append(f" - function signature is different: ")
    failure_list.append(" - function signature is different: ")
    failure_list.append(f" - the base signature is {value}.")
    failure_list.append(f" - now it is {signature}.")

@@ -170,12 +170,12 @@ def api_signature(obj, api_str, content, base_schema, failure_list, is_update=Fa
    else:
        tmp_len = -1
        signature = None
        for i in range(len(signature_list)):
            if signature_list[i] == "(*args, **kwargs)":
        for _, sig in enumerate(signature_list):
            if sig == "(*args, **kwargs)":
                continue
            if len(signature_list[i]) > tmp_len:
                tmp_len = len(signature_list[i])
                signature = signature_list[i]
            if len(sig) > tmp_len:
                tmp_len = len(sig)
                signature = sig
    else:
        signature = str(inspect.signature(obj))

@@ -293,7 +293,8 @@ class TestApiStability:
    def check_one_element(elem, mod_name, mod, is_public):
        obj = getattr(mod, elem)
        if hasattr(obj, "__module__"):
            if obj.__module__ not in ['sentencepiece_model_pb2']:  # cannot use __import__ module list
            # cannot use __import__ module list
            if obj.__module__ not in ['sentencepiece_model_pb2', 'node_strategy_pb2']:
                mod_source = str(__import__(obj.__module__))
                if "mindformers" not in mod_source:
                    return
@@ -337,4 +338,4 @@ class TestApiStability:
        with open(self.api_json_path, "w", encoding="utf-8") as w:
            w.write(json.dumps(self.content, ensure_ascii=False, indent=4))

        assert not self.is_update, f"self.is_update should be set to False"
        assert not self.is_update, "self.is_update should be set to False"

@@ -22,14 +22,14 @@ from mindspore.ops import operations as ops
from mindspore.common.api import _cell_graph_executor

from mindformers.core import CrossEntropyLoss
from mindformers.modules import FeedForward, FixedSparseAttention, LowerTriangularMaskWithDynamic
from mindformers.modules import FixedSparseAttention, LowerTriangularMaskWithDynamic


class MyActivation(mindspore.nn.Cell):
    """An example of custom activation"""

    def __init__(self):
        super(MyActivation, self).__init__()
        super().__init__()
        self.add = ops.Add()

    def construct(self, x):
@@ -43,27 +43,13 @@ class MyActivationNoShard(mindspore.nn.Cell):
    """An example of custom activation without shard"""

    def __init__(self):
        super(MyActivationNoShard, self).__init__()
        super().__init__()
        self.add = ops.Add()

    def construct(self, x):
        return self.add(x, 0.1)


def test_feedforward():
    """
    Feature: Feedforward
    Description: Test Feedforward module
    Expectation: No exception
    """
    model = FeedForward(hidden_size=15,
                        ffn_hidden_size=30,
                        dropout_rate=0.1,
                        hidden_act='relu')
    tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
    _cell_graph_executor.compile(model, tensor)


def test_cross_entropy_loss():
    """
    Feature: CrossEntropyLoss