Mirror of https://gitee.com/mindspore/mindformers.git (synced 2025-12-06 11:29:59 +08:00)
!7787 【master】【mcore】【bugfix】Fix the incorrect path in the YAML file and Fix…
Merge pull request !7787 from zhangyihui/master-bugfix
@@ -136,7 +136,7 @@ train_dataset: &train_dataset
#### 3. Launch the pretraining task

-Specify the model path and the configuration file [configs/qwen3/pretrain_qwen3_30b_a3b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/master/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml), then launch the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/master/run_mindformer.py) script with msrun to run 16-card distributed training. You can refer to the following approach to launch training on two Atlas 800T A2 (64G) servers.
+Specify the model path and the configuration file [configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/master/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml), then launch the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/master/run_mindformer.py) script with `msrun` to run 16-card distributed training. You may refer to the following approach to launch training on two Atlas 800T A2 (64G) servers.
Run the following command on each server. Set `master_ip` to the IP address of the master node, i.e. the IP of the `Rank 0` server; `node_rank` to the index of each node; and `port` to the port number of the current process (choose a value between 50000 and 65536).
@@ -3,10 +3,10 @@ mindformers.models.LlamaForCausalLM
.. py:class:: mindformers.models.LlamaForCausalLM(config=None)

-    Calculates online and provides the loss values and logits when running LLama training.
+    Calculates online and provides the loss values and logits when running Llama training.

    Parameters:
-        - **config** (LlamaConfig, optional) - Configuration of the LLama model. Default: ``None``.
+        - **config** (LlamaConfig, optional) - Configuration of the Llama model. Default: ``None``.

    Inputs:
        - **input_ids** (Tensor) - Indices of the input sequence tokens in the vocabulary, with data type Int64/Int32; the shape of the tensor is :math:`(batch, seq\_length)`.
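For context, a minimal usage sketch of the class documented in this hunk (illustrative only, not part of this commit; the import path, the small config values, and the vocabulary size are assumptions, and real configurations come from the repository's YAML files):

```python
import numpy as np
import mindspore as ms
from mindformers.models import LlamaConfig, LlamaForCausalLM

# Tiny, illustrative hyper-parameters (assumed names; real runs use the YAML configs).
config = LlamaConfig(num_layers=2, hidden_size=512, num_heads=8,
                     seq_length=128, vocab_size=32000)
model = LlamaForCausalLM(config)

# Indices of input sequence tokens in the vocabulary, data type Int32,
# shaped (batch, seq_length) as documented above.
input_ids = ms.Tensor(
    np.random.randint(0, config.vocab_size, size=(1, config.seq_length)),
    ms.int32)
# Per the docstring, the network returns the loss in train mode and the logits in predict mode.
```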
@@ -3,7 +3,7 @@ mindformers.tools.MindFormerConfig
.. py:class:: mindformers.tools.MindFormerConfig(*args, **kwargs)

-    A class for configuration that inherits from Python's dict class. It can parse configuration parameters from a yaml file or a dict instance.
+    A configuration class that inherits from Python's dict class. It can parse configuration parameters from a yaml file or a dict instance.

    Parameters:
        - **args** (Any) - An extensible list of arguments, which can be a yaml configuration file path or a configuration dict.
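For context, a minimal usage sketch of the two documented ways to build a config (illustrative only, not part of this commit; the yaml path is an example and assumes the repository root as working directory):

```python
from mindformers.tools import MindFormerConfig

# Parse a yaml configuration file (example path, run from the repository root).
cfg = MindFormerConfig("configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml")

# Build a config from plain key/value pairs, as with a regular dict.
cfg2 = MindFormerConfig(run_mode="train", seed=42)
print(cfg2.run_mode)   # keys are reachable with attribute access ...
print(cfg2["seed"])    # ... and with ordinary dict indexing
```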
@@ -111,22 +111,22 @@ class LlamaModel(LlamaPreTrainedModel):
        else:
            logger.info("MoE config is None, use normal FFN")
        if not self.use_flash_attention and self.use_ring_attention:
-            raise ValueError(f"When the ring_attention = True, the flash_attention must be True.")
+            raise ValueError("When the ring_attention = True, the flash_attention must be True.")
        if not self.use_flash_attention and self.use_eod_attn_mask_compression:
-            raise ValueError(f"When the use_eod_attn_mask_compression = True, the flash_attention must be True.")
+            raise ValueError("When the use_eod_attn_mask_compression = True, the flash_attention must be True.")
        self.seq_split_num = config.parallel_config.seq_split_num
        self.seq_pipe = self.seq_split_num > 1
        if self.seq_pipe:
            dp = config.parallel_config.data_parallel
            if self.use_ring_attention:
-                raise ValueError(f"When the seq_pipe = True, the use_ring_attention cannot be True.")
+                raise ValueError("When the seq_pipe = True, the use_ring_attention cannot be True.")
            if config.use_attn_mask_compression and not check_seqpp_fa_opt_support():
-                raise ValueError(f"Currently, when the seq_pipe = True, "
-                                 f"use_attn_mask_compress must be False with mindspore < 2.6.0. "
-                                 f"If you want to enable it, please upgrade mindspore to 2.6.0 or later.")
+                raise ValueError("Currently, when the seq_pipe = True, "
+                                 "use_attn_mask_compress must be False with mindspore < 2.6.0. "
+                                 "If you want to enable it, please upgrade mindspore to 2.6.0 or later.")
            if config.use_eod_attn_mask_compression:
-                raise ValueError(f"Currently, when the seq_pipe = True, "
-                                 f"use_eod_attn_mask_compression cannot be True.")
+                raise ValueError("Currently, when the seq_pipe = True, "
+                                 "use_eod_attn_mask_compression cannot be True.")
            self.n_kv_head = self.n_head if config.n_kv_heads is None else config.n_kv_heads
            kv_shape = (config.batch_size * dp, self.n_kv_head, config.seq_length, self.head_dim)
            self.zeros = initializer('zeros', kv_shape, dtype=self.dtype)
@@ -430,10 +430,10 @@ class LlamaModel(LlamaPreTrainedModel):
@MindFormerRegister.register(MindFormerModuleType.MODELS)
class LlamaForCausalLM(LlamaPreTrainedModel):
    r"""
-    Provide llama training loss or logits through network.
+    Provide Llama training loss or logits through network.

    Args:
-        config (LlamaConfig, optional): The config of llama model. Default: `None` .
+        config (LlamaConfig, optional): The config of Llama model. Default: `None` .

    Inputs:
        - **input_ids** (Tensor) - the indices of input sequence tokens in the vocabulary with data type Int64/Int32,
@@ -485,7 +485,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
    @lazy_inline
    def __init__(self, config: LlamaConfig = None):
-        super(LlamaForCausalLM, self).__init__(config, auto_prefix=True)
+        super().__init__(config, auto_prefix=True)
        _check_config(config.parallel_config)
        self.config = config
        self.ignore_token_id = config.ignore_token_id
@@ -507,7 +507,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
        self.prefill_gather_flatten = P.Gather()
        self.sub_batch_valid_len = P.Sub()
        self.predict_run_mode = get_predict_run_mode()
-        logger.info("Predict run mode: {}".format(self.predict_run_mode))
+        logger.info(f"Predict run mode: {self.predict_run_mode}")
        if self.predict_run_mode and self.config.is_dynamic:
            logger.info("use_flash_attention is set to True when run_mode is predict and is_dynamic is True.")
            self.config.use_flash_attention = True
@@ -807,7 +807,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
    @classmethod
    def obtain_name_map(cls, load_checkpoint_files):
-        name_map = dict()
+        name_map = {}
        for checkpoint_file in load_checkpoint_files:
            with safe_open(checkpoint_file, framework="np") as f:
                for k in f.keys():
@@ -829,6 +829,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
        return params

+
# pylint: disable=C0415
def _concat_qkv_weight(wq_keys, wk_keys, wv_keys, model_config, qkv_dict, condition, target_dict):
    """concat qkv weight from dicts"""
    from mindformers.utils.convert_utils import qkv_concat_hf2mg
@@ -876,6 +877,7 @@ def _concat_qkv_weight(wq_keys, wk_keys, wv_keys, model_config, qkv_dict, condit
        target_dict.update({w_qkv_key: w_qkv_value_mg})

+
# pylint: disable=C0415
def _concat_ffn_weight(w1_keys, w3_keys, model_config, qkv_dict, condition, target_dict):
    """concat ffn weight from dicts"""
    from mindformers.utils.convert_utils import ffn_concat_hf2mg
@@ -50,10 +50,10 @@ from mindformers.generation.utils import convert_pin
@MindFormerRegister.register(MindFormerModuleType.MODELS)
class ParallelLlamaForCausalLM(LlamaPreTrainedModel):
    r"""
-    Provide llama training loss or logits through network.
+    Provide Llama training loss or logits through network.

    Args:
-        config (LlamaConfig): The config of llama model.
+        config (LlamaConfig): The config of Llama model.

    Returns:
        output: Tensor, the output of llama decoderlayer