Mirror of https://gitee.com/mindspore/mindformers.git (synced 2025-12-06 11:29:59 +08:00)
!7787 【master】【mcore】【bugfix】Fix the incorrect path in the YAML file and Fix…
Merge pull request !7787 from zhangyihui/master-bugfix
@@ -136,7 +136,7 @@ train_dataset: &train_dataset
#### 3. Launch the pretraining task

-Specify the model path and the configuration file [configs/qwen3/pretrain_qwen3_30b_a3b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/master/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml), then launch the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/master/run_mindformer.py) script with msrun to run 16-card distributed training. You can refer to the following approach to launch training on two Atlas 800T A2 (64G) servers.
+Specify the model path and the configuration file [configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/master/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml), then launch the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/master/run_mindformer.py) script with `msrun` to run 16-card distributed training. You may refer to the following approach to launch training on two Atlas 800T A2 (64G) servers.
Run the following command on each server. Set `master_ip` to the IP address of the master node, i.e. the IP of the `Rank 0` server; `node_rank` to the index of each node; and `port` to the port number of the current process (choose a value between 50000 and 65536).
@@ -3,10 +3,10 @@ mindformers.models.LlamaForCausalLM
.. py:class:: mindformers.models.LlamaForCausalLM(config=None)

-    Calculates online and provides the loss values and logits when running LLama training.
+    Calculates online and provides the loss values and logits when running Llama training.

    Parameters:
-        - **config** (LlamaConfig, optional) - Configuration of the LLama model. Default: ``None``.
+        - **config** (LlamaConfig, optional) - Configuration of the Llama model. Default: ``None``.

    Inputs:
        - **input_ids** (Tensor) - Indices of the input sequence tokens in the vocabulary, with data type Int64/Int32; the shape of the tensor is :math:`(batch, seq\_length)`.
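For context, a minimal usage sketch of the class documented in this hunk (illustrative only, not part of this commit; the import path, the small config values, and the vocabulary size are assumptions, and real configurations come from the repository's YAML files):

```python
import numpy as np
import mindspore as ms
from mindformers.models import LlamaConfig, LlamaForCausalLM

# Tiny, illustrative hyper-parameters (assumed names; real runs use the YAML configs).
config = LlamaConfig(num_layers=2, hidden_size=512, num_heads=8,
                     seq_length=128, vocab_size=32000)
model = LlamaForCausalLM(config)

# Indices of input sequence tokens in the vocabulary, data type Int32,
# shaped (batch, seq_length) as documented above.
input_ids = ms.Tensor(
    np.random.randint(0, config.vocab_size, size=(1, config.seq_length)),
    ms.int32)
# Per the docstring, the network returns the loss in train mode and the logits in predict mode.
```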
@@ -3,7 +3,7 @@ mindformers.tools.MindFormerConfig
.. py:class:: mindformers.tools.MindFormerConfig(*args, **kwargs)

-    A class for configuration that inherits from Python's dict class. It can parse configuration parameters from a yaml file or a dict instance.
+    A configuration class that inherits from Python's dict class. It can parse configuration parameters from a yaml file or a dict instance.

    Parameters:
        - **args** (Any) - An extensible list of arguments, which can be a yaml configuration file path or a configuration dict.
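For context, a minimal usage sketch of the two documented ways to build a config (illustrative only, not part of this commit; the yaml path is an example and assumes the repository root as working directory):

```python
from mindformers.tools import MindFormerConfig

# Parse a yaml configuration file (example path, run from the repository root).
cfg = MindFormerConfig("configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml")

# Build a config from plain key/value pairs, as with a regular dict.
cfg2 = MindFormerConfig(run_mode="train", seed=42)
print(cfg2.run_mode)   # keys are reachable with attribute access ...
print(cfg2["seed"])    # ... and with ordinary dict indexing
```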
@@ -111,22 +111,22 @@ class LlamaModel(LlamaPreTrainedModel):
        else:
            logger.info("MoE config is None, use normal FFN")
        if not self.use_flash_attention and self.use_ring_attention:
-            raise ValueError(f"When the ring_attention = True, the flash_attention must be True.")
+            raise ValueError("When the ring_attention = True, the flash_attention must be True.")
        if not self.use_flash_attention and self.use_eod_attn_mask_compression:
-            raise ValueError(f"When the use_eod_attn_mask_compression = True, the flash_attention must be True.")
+            raise ValueError("When the use_eod_attn_mask_compression = True, the flash_attention must be True.")
        self.seq_split_num = config.parallel_config.seq_split_num
        self.seq_pipe = self.seq_split_num > 1
        if self.seq_pipe:
            dp = config.parallel_config.data_parallel
            if self.use_ring_attention:
-                raise ValueError(f"When the seq_pipe = True, the use_ring_attention cannot be True.")
+                raise ValueError("When the seq_pipe = True, the use_ring_attention cannot be True.")
            if config.use_attn_mask_compression and not check_seqpp_fa_opt_support():
-                raise ValueError(f"Currently, when the seq_pipe = True, "
-                                 f"use_attn_mask_compress must be False with mindspore < 2.6.0. "
-                                 f"If you want to enable it, please upgrade mindspore to 2.6.0 or later.")
+                raise ValueError("Currently, when the seq_pipe = True, "
+                                 "use_attn_mask_compress must be False with mindspore < 2.6.0. "
+                                 "If you want to enable it, please upgrade mindspore to 2.6.0 or later.")
            if config.use_eod_attn_mask_compression:
-                raise ValueError(f"Currently, when the seq_pipe = True, "
-                                 f"use_eod_attn_mask_compression cannot be True.")
+                raise ValueError("Currently, when the seq_pipe = True, "
+                                 "use_eod_attn_mask_compression cannot be True.")
            self.n_kv_head = self.n_head if config.n_kv_heads is None else config.n_kv_heads
            kv_shape = (config.batch_size * dp, self.n_kv_head, config.seq_length, self.head_dim)
            self.zeros = initializer('zeros', kv_shape, dtype=self.dtype)
@@ -430,10 +430,10 @@ class LlamaModel(LlamaPreTrainedModel):
@MindFormerRegister.register(MindFormerModuleType.MODELS)
class LlamaForCausalLM(LlamaPreTrainedModel):
    r"""
-    Provide llama training loss or logits through network.
+    Provide Llama training loss or logits through network.

    Args:
-        config (LlamaConfig, optional): The config of llama model. Default: `None` .
+        config (LlamaConfig, optional): The config of Llama model. Default: `None` .

    Inputs:
        - **input_ids** (Tensor) - the indices of input sequence tokens in the vocabulary with data type Int64/Int32,
@@ -485,7 +485,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
    @lazy_inline
    def __init__(self, config: LlamaConfig = None):
-        super(LlamaForCausalLM, self).__init__(config, auto_prefix=True)
+        super().__init__(config, auto_prefix=True)
        _check_config(config.parallel_config)
        self.config = config
        self.ignore_token_id = config.ignore_token_id
@@ -507,7 +507,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
        self.prefill_gather_flatten = P.Gather()
        self.sub_batch_valid_len = P.Sub()
        self.predict_run_mode = get_predict_run_mode()
-        logger.info("Predict run mode: {}".format(self.predict_run_mode))
+        logger.info(f"Predict run mode: {self.predict_run_mode}")
        if self.predict_run_mode and self.config.is_dynamic:
            logger.info("use_flash_attention is set to True when run_mode is predict and is_dynamic is True.")
            self.config.use_flash_attention = True
@@ -807,7 +807,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
    @classmethod
    def obtain_name_map(cls, load_checkpoint_files):
-        name_map = dict()
+        name_map = {}
        for checkpoint_file in load_checkpoint_files:
            with safe_open(checkpoint_file, framework="np") as f:
                for k in f.keys():
@@ -829,6 +829,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
        return params

+
# pylint: disable=C0415
def _concat_qkv_weight(wq_keys, wk_keys, wv_keys, model_config, qkv_dict, condition, target_dict):
    """concat qkv weight from dicts"""
    from mindformers.utils.convert_utils import qkv_concat_hf2mg
@@ -876,6 +877,7 @@ def _concat_qkv_weight(wq_keys, wk_keys, wv_keys, model_config, qkv_dict, condit
        target_dict.update({w_qkv_key: w_qkv_value_mg})

+
# pylint: disable=C0415
def _concat_ffn_weight(w1_keys, w3_keys, model_config, qkv_dict, condition, target_dict):
    """concat ffn weight from dicts"""
    from mindformers.utils.convert_utils import ffn_concat_hf2mg
@@ -50,10 +50,10 @@ from mindformers.generation.utils import convert_pin
@MindFormerRegister.register(MindFormerModuleType.MODELS)
class ParallelLlamaForCausalLM(LlamaPreTrainedModel):
    r"""
-    Provide llama training loss or logits through network.
+    Provide Llama training loss or logits through network.

    Args:
-        config (LlamaConfig): The config of llama model.
+        config (LlamaConfig): The config of Llama model.

    Returns:
        output: Tensor, the output of llama decoderlayer