!3020 [pytorch][bugfix] fix the hang issue in dense+MoE mixed-structure models

Merge pull request !3020 from sunjunjie/2.1.0
sunjunjie
2025-07-21 02:56:54 +00:00
committed by i-robot
parent a840be21ef
commit 0d412be9c5


@@ -1120,6 +1120,9 @@ def _validate_transformer_block_build_layers(args):
     if args.first_k_dense_replace and args.num_layers <= args.first_k_dense_replace:
         raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) when first-k-dense-replace is set.'.format(args.num_layers,
                                                                                             args.first_k_dense_replace))
+    if args.first_k_dense_replace and args.pipeline_model_parallel_size > 1:
+        if args.first_k_dense_replace >= args.num_layers // args.pipeline_model_parallel_size:
+            raise AssertionError('When using first-k-dense-replace, it is not allowed for all layers within a pp stage to be dense layers.')
     if args.num_experts is not None and args.use_mc2 and args.moe_grouped_gemm:
         raise AssertionError('Moe Grouped Gemm is not supported with mc2 in MOE model.')
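For context, below is a minimal, self-contained sketch of the new guard. It assumes a Megatron-style argument namespace where the transformer layers are split evenly across pipeline stages (layers per stage = num_layers // pipeline_model_parallel_size), so the check rejects any layout in which an entire pipeline stage would consist only of dense (non-MoE) layers. The helper name _check_first_k_dense_replace and the SimpleNamespace configs are hypothetical and only illustrate when the assertion fires; the exact mechanism of the reported hang is not described in the diff.

    from types import SimpleNamespace

    def _check_first_k_dense_replace(args):
        # Mirrors the added validation: with pipeline parallelism enabled,
        # first-k-dense-replace must leave at least one MoE layer in the
        # first pipeline stage, otherwise an all-dense stage can trigger
        # the hang this bugfix targets (assumed failure mode).
        if args.first_k_dense_replace and args.pipeline_model_parallel_size > 1:
            layers_per_stage = args.num_layers // args.pipeline_model_parallel_size
            if args.first_k_dense_replace >= layers_per_stage:
                raise AssertionError(
                    'When using first-k-dense-replace, it is not allowed for '
                    'all layers within a pp stage to be dense layers.')

    # Hypothetical toy configs:
    ok = SimpleNamespace(num_layers=8, pipeline_model_parallel_size=2,
                         first_k_dense_replace=3)   # 4 layers per stage; stage 0 still has 1 MoE layer
    bad = SimpleNamespace(num_layers=8, pipeline_model_parallel_size=4,
                          first_k_dense_replace=3)  # 2 layers per stage; stages 0 and 1 would be all dense

    _check_first_k_dense_replace(ok)       # passes silently
    try:
        _check_first_k_dense_replace(bad)
    except AssertionError as e:
        print(e)                            # the new guard rejects this layout at startup

The point of validating this at argument-parsing time is to fail fast with a clear message instead of letting the mixed dense+MoE model stall during distributed execution.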