!3113 [pytorch][bugfix] fix some bugs for icsl

Merge pull request !3113 from jzh/210_uicsl
jzh
2025-08-08 07:13:20 +00:00
committed by i-robot
parent a4567e5b65
commit 6785f325f3
49 changed files with 225 additions and 102 deletions

View File

@@ -46,7 +46,9 @@
1. Users are advised to write training scripts that match the available runtime resources. If a script does not match the resources, for example the dataset loaded into memory exceeds the memory limit or the data the script generates locally exceeds the available disk space, errors may occur and the process may exit unexpectedly.
2. MindSpeed-LLM uses pytorch internally; version mismatches may cause runtime errors. See the pytorch [security statement](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E) for details.
3. For security reasons, all torch.load calls currently use weights_only=True; adjust this setting as needed.
4. HumanEval uses subprocess.run, which carries security risks. To keep the feature usable, some safety checks were added as a mitigation; users should build and extend the blacklist themselves as needed to harden security.
5. For security reasons, trust_remote_code=False is set by default, so open-source models not supported by the official Transformers repository cannot be loaded remotely; pass the --trust-remote-code argument manually if needed (see the sketch below).
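
A minimal sketch of how items 3 and 5 look in practice, assuming a local checkpoint and tokenizer (paths are placeholders):

```python
import torch
from transformers import AutoTokenizer

# item 3: torch.load defaults to weights_only=True in this repository
state_dict = torch.load("/path/to/model_optim_rng.pt", map_location="cpu", weights_only=True)

# item 5: remote code stays disabled unless --trust-remote-code is passed explicitly
tokenizer = AutoTokenizer.from_pretrained(
    "/path/to/tokenizer",
    trust_remote_code=False,   # set True only for models whose custom code you trust
    local_files_only=True,
)
```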
## Public Network Address Declaration

View File

@@ -0,0 +1,9 @@
[
"os\\.(system|popen|exec|setuid|setgid|chroot)\\s*\\(",
"subprocess\\.(run|Popen|call)\\s*\\(",
"pty\\.spawn\\s*\\(",
"(requests|urllib|socket|httpx)\\.(get|post|urlopen|connect)\\s*\\(",
"open\\s*\\(",
"os\\.(remove|rename|chmod|chown|mkdir)\\s*\\(",
"(eval|exec|__import__|globals|locals)\\s*\\("
]
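
A minimal sketch of how a blacklist like this could be applied before executing generated code; only the file name configs/dangerous_shell.json comes from this commit, the helper and test snippets are illustrative:

```python
import json
import re

with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f:
    dangerous_patterns = json.load(f)

def looks_dangerous(code: str) -> bool:
    # flag the snippet if any blacklist regex matches
    return any(re.search(pattern, code) for pattern in dangerous_patterns)

print(looks_dangerous('subprocess.run(["ls"])'))            # True
print(looks_dangerous("def add(a, b):\n    return a + b"))  # False
```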

View File

@@ -76,6 +76,10 @@ def main():
help='Enable only save lora-checkpoint to hf')
parser.add_argument('--load-checkpoint-loosely', action='store_true', default=False,
help='Enable loading checkpoint not strictly.')
parser.add_argument('--trust-remote-code',
action='store_true',
default=False,
help='enable trust-remote-code for transformer to load model')
known_args, _ = parser.parse_known_args()
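
A hedged usage sketch of the new flag; the argument definition mirrors the hunk above, while the model path and loader call are placeholders:

```python
import argparse
from transformers import AutoModelForCausalLM

parser = argparse.ArgumentParser()
parser.add_argument('--trust-remote-code', action='store_true', default=False,
                    help='enable trust-remote-code for transformer to load model')
known_args, _ = parser.parse_known_args()

# the parsed flag is forwarded to the Hugging Face loader instead of hard-coding True
model = AutoModelForCausalLM.from_pretrained(
    "/path/to/hf_model",
    trust_remote_code=known_args.trust_remote_code,
)
```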

View File

@@ -380,7 +380,7 @@ def main():
model_provider=model_provider,
pretrained_model_name_or_path=args.load
)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=False, local_files_only=True)
rank = dist.get_rank()
if 'cmmlu' in args.task:

View File

@@ -12,7 +12,7 @@ import safetensors
import torch
import safetensors.torch
import bitsandbytes as bnb
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@@ -75,8 +75,8 @@ class CkptConvert(object):
self.vpp_stage = vpp_stage
if vpp_stage is not None:
self.vpp_size = self.num_layers // self.pp_size // self.vpp_stage
self.hf_model_path = hf_model_path
self.mg_save_path = mg_save_path
self.hf_model_path = standardize_path(hf_model_path, check_read=True)
self.mg_save_path = standardize_path(mg_save_path, check_write=True)
self.num_layer_list = num_layer_list
self.noop_layers = noop_layers
self.moe_grouped_gemm = moe_grouped_gemm
@@ -138,7 +138,7 @@ class CkptConvert(object):
"""megatron model path"""
iter_mg_path = os.path.join(mg_path, "iter_0000001")
if not os.path.exists(mg_path):
os.makedirs(mg_path, exist_ok=True)
os.makedirs(mg_path, mode=0o750, exist_ok=True)
with open(os.path.join(mg_path, "latest_checkpointed_iteration.txt"), 'w') as f:
f.write("1")
@@ -786,7 +786,7 @@ class CkptConvert(object):
for tp_rank in range(self.tp_size):
save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank)
parallel_save_path = os.path.join(save_model_path, save_prefix)
os.makedirs(parallel_save_path)
os.makedirs(parallel_save_path, mode=0o750, exist_ok=True)
save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt")
logger.info(f"Saving to {save_file_name}")
@@ -845,7 +845,7 @@ class CkptConvert(object):
for tp_rank in range(self.tp_size):
save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank)
parallel_save_path = os.path.join(save_model_path, save_prefix)
os.makedirs(parallel_save_path, exist_ok=True)
os.makedirs(parallel_save_path, mode=0o750, exist_ok=True)
save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt")
logger.info(f"Saving to {save_file_name}")
model_dict = {"checkpoint_version": 3.0, "iteration": 1}
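
The save directories above are now created with an explicit 0o750 mode. A small standalone sketch of the pattern, assuming the process umask may mask the bits further; the path is a placeholder:

```python
import os
import stat

save_dir = "/tmp/mg_ckpt/mp_rank_00"                 # placeholder path
# rwxr-x--- for owner and group, nothing for others; exist_ok tolerates re-runs
os.makedirs(save_dir, mode=0o750, exist_ok=True)
print(oct(stat.S_IMODE(os.stat(save_dir).st_mode)))  # typically 0o750, or less after umask
```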

View File

@@ -14,7 +14,7 @@ import tqdm
import torch
import torch_npu
import safetensors.torch
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@@ -33,7 +33,7 @@ GLOBAL_LM_HEAD_WEIGHTS = None
def load_data(file_path):
logger.info(f"Loading the checkpoint from {file_path}.")
return torch.load(file_path, map_location='cpu', weights_only=False)
return torch.load(file_path, map_location='cpu', weights_only=True)
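
weights_only=True restricts unpickling to tensors and plain containers. A sketch, assuming PyTorch 2.4+ where torch.serialization.add_safe_globals is available, of loading a checkpoint that also stores a custom class; the class here is hypothetical:

```python
import torch
import torch.serialization

class TrainingMeta:  # hypothetical class embedded in the checkpoint
    def __init__(self, step):
        self.step = step

# allow-list the class so torch.load(..., weights_only=True) can rebuild it
torch.serialization.add_safe_globals([TrainingMeta])
ckpt = torch.load("/path/to/model_optim_rng.pt", map_location="cpu", weights_only=True)
```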
def tensor_memory_size(tensor):
@@ -73,15 +73,15 @@ class MgCkptConvert(object):
self.ep_size = ep_size
self.vpp_stage = vpp_stage
self.mg_model_path = mg_model_path
self.hf_save_path = hf_save_path
self.mg_model_path = standardize_path(mg_model_path, check_read=True)
self.hf_save_path = standardize_path(hf_save_path, check_write=True)
self.lora_model_path = lora_model_path
self.iter_path = self.get_iter_path(self.mg_model_path)
if self.lora_model_path is not None:
self.lora_iter_path = self.get_iter_path(self.lora_model_path)
if not os.path.exists(self.hf_save_path):
os.makedirs(self.hf_save_path)
os.makedirs(self.hf_save_path, mode=0o750, exist_ok=True)
self.num_layers = num_layers
self.noop_layers = noop_layers
@@ -194,7 +194,7 @@ class MgCkptConvert(object):
directory = os.path.join(ckpt_path, f'iter_{iteration:07d}')
os.makedirs(directory, exist_ok=True)
os.makedirs(directory, mode=0o750, exist_ok=True)
return directory

View File

@@ -16,6 +16,7 @@ from megatron.core.datasets.gpt_dataset import (_build_document_index,
_build_shuffle_index
)
from mindspeed_llm.tasks.utils.error_utils import GPTDatasetSampleIndexError
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
from .blended_megatron_dataset_builder import need_to_build_dataset
logger = logging.getLogger(__name__)
@@ -65,11 +66,13 @@ def _build_document_sample_shuffle_indices(
Returns:
Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index
"""
path_to_cache = self.config.path_to_cache
if path_to_cache is None and not self.config.mock:
path_to_cache = os.path.join(
self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices"
)
path_to_cache = standardize_path(path_to_cache, check_write=True)
# start of megatron_adaptation,
# here we change from (class)GPTDataset._build_document_sample_shuffle_indices
@@ -196,8 +199,7 @@ def _build_document_sample_shuffle_indices(
)
if any(sample_index[:, 0] < 0):
_url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98"
raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information")
raise GPTDatasetSampleIndexError(f"Bad sample index.")
# Build the shuffle index
if separate_final_epoch:
@@ -210,7 +212,7 @@ def _build_document_sample_shuffle_indices(
)
if path_to_cache:
os.makedirs(path_to_cache, exist_ok=True)
os.makedirs(path_to_cache, mode=0o750, exist_ok=True)
# Write the description
with open(path_to_description, "wt") as writer:
writer.write(self.unique_description)
@@ -256,8 +258,7 @@ def _build_document_sample_shuffle_indices(
sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
if any(sample_index[:, 0] < 0):
_url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98"
raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information")
raise GPTDatasetSampleIndexError(f"Bad sample index.")
t_end = time.time()
log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

View File

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Expert parallel groups."""
import os
import sys
from functools import wraps
from typing import Optional
@@ -24,7 +24,7 @@ import torch_npu
import megatron
from megatron.core.parallel_state import get_context_parallel_world_size, get_nccl_options
from mindspeed.core.parallel_state import hccl_buffer_auto_adaptive, parse_hccl_buffer_string
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
_EXPERT_PARALLEL_GROUP = None
_MPU_EXPERT_MODEL_PARALLEL_RANK = None
_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None
@@ -48,6 +48,8 @@ def initialize_model_parallel_decorator(initialize_model_parallel):
from megatron.training.utils import print_rank_0
timeout = timedelta(minutes=distributed_timeout_minutes)
nccl_communicator_config_path = standardize_path(nccl_communicator_config_path, check_read=True)
if pipeline_model_parallel_size == 2 and virtual_pipeline_model_parallel_size is not None:
megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
@@ -261,6 +263,8 @@ def initialize_model_parallel_wrapper(initialize_model_parallel):
if args.hccl_group_buffer is not None:
parse_hccl_buffer_string(args.hccl_group_buffer)
nccl_communicator_config_path = standardize_path(nccl_communicator_config_path, check_read=True)
data_parallel_size = 1 # dp 1
rank = torch.distributed.get_rank()
all_ep_groups = []

View File

@@ -15,7 +15,7 @@ try:
from mindspeed_llm.core.pipeline_parallel.dualpipe.gpt_model import gpt_model_forward_backward_overlaping
from mindspeed_llm.core.pipeline_parallel.dualpipe.MTP_overlap import forward_overlap
except ImportError:
pass
print("[warning] failed import dualpipe modules, not support dualpipe")
from mindspeed_llm.core.transformer.multi_token_prediction import MultiTokenPredictionLayer, MTPLossAutoScaler

View File

@@ -44,4 +44,8 @@ def _add_moba_args(parser):
group.add_argument('--moba-calc-method', type=int, default=1,
help='moba calculation method. 1: naive attention with naive attention operations; 2: use flash'
'attention. default: 1')
group.add_argument('--trust-remote-code',
action='store_true',
default=False,
help='enable trust-remote-code for transformer to load model')
return parser

View File

@@ -8,6 +8,7 @@ import logging as logger
import argparse
import torch
import safetensors.torch
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@@ -67,7 +68,7 @@ class CheckpointConverter:
try:
if filename.endswith(".bin"):
cur_weights = torch.load(file_path, map_location=torch.device('cpu'))
cur_weights = torch.load(file_path, map_location=torch.device('cpu'), weights_only=True)
model_dict.update(cur_weights)
print(f"Successfully loaded: {filename}")
loaded = True
@@ -349,7 +350,7 @@ class CheckpointConverter:
out_iteration, input_model_dir, src_model_file = self.get_latest_checkpoint_model_file(self.args.load_dir)
src_model = torch.load(src_model_file, map_location='cpu', weights_only=False)
src_model = torch.load(src_model_file, map_location='cpu', weights_only=True)
logger.info(f"Sample model {src_model_file} is loaded.\n")
return out_iteration, input_model_dir, src_model
@@ -381,7 +382,7 @@ class CheckpointConverter:
input_pp_rank
)
tp_models.append(torch.load(model_file, map_location='cpu', weights_only=False))
tp_models.append(torch.load(model_file, map_location='cpu', weights_only=True))
logger.info(f"Model {model_file} is loaded.")
if input_tp_rank > 1:
@@ -475,7 +476,7 @@ class CheckpointConverter:
dir_name += f"_{pp_idx:03d}"
save_path = os.path.join(args.save_dir, f"iter_{out_iteration:07d}", dir_name)
os.makedirs(save_path, exist_ok=True)
os.makedirs(save_path, mode=0o750, exist_ok=True)
return os.path.join(save_path, filename)
@@ -589,6 +590,8 @@ def run():
args, _ = parser.parse_known_args()
args.load_dir = standardize_path(args.load_dir, check_read=True)
converter = CheckpointConverter(args)
converter.main()

View File

@@ -18,8 +18,8 @@ import json
import os
import stat
import time
import torch
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
def get_json_from_file(json_file):
@@ -123,10 +123,11 @@ class ConvertBase:
self.mg_latest_ckpt_file_name = "latest_checkpointed_iteration.txt"
# hf model index_file
self.model_index_file = os.path.join(
self.args_cmd.hf_dir,
"pytorch_model.bin.index.json") if self.args_cmd.model_index_file is None \
index_file = os.path.join(self.args_cmd.hf_dir, "pytorch_model.bin.index.json")
self.model_index_file = index_file if self.args_cmd.model_index_file is None \
else self.args_cmd.model_index_file
self.model_index_file = standardize_path(self.model_index_file, check_read=True)
self.model_index_map = get_json_from_file(self.model_index_file)
# hf model config_file
self.config_file = os.path.join(
@@ -217,7 +218,7 @@ class ConvertBase:
hf_model[k] = f.get_tensor(k)
elif str(model_files).endswith(".bin"):
print(f"load file : {file_path}")
hf_model = torch.load(file_path, map_location='cpu', weights_only=False)
hf_model = torch.load(file_path, map_location='cpu', weights_only=True)
else:
raise ValueError(f"unsupported model file format. {os.path.splitext(hf_model)[-1]} ")
return hf_model
@@ -550,17 +551,18 @@ class ConvertHf2Mg(ConvertBase):
ep_rank=ep_rank)
save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank,
iteration=iteration)
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir, mode=0o750, exist_ok=True)
torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name))
else: # Dense Model
model_dict = self._set_dense_mg_model(hf_model=hf_model, tp_rank=tp_rank, pp_rank=pp_rank)
save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=None,
iteration=iteration)
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir, mode=0o750, exist_ok=True)
torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name))
# write latest_checkpointed_iteration.txt
latest_ckpt_file_path = os.path.join(self.args_cmd.mg_dir, self.mg_latest_ckpt_file_name)
latest_ckpt_file_path = standardize_path(latest_ckpt_file_path, check_write=True)
modes = stat.S_IWUSR | stat.S_IRUSR | stat.S_IWGRP | stat.S_IRGRP
with os.fdopen(os.open(latest_ckpt_file_path, flags=os.O_RDWR | os.O_CREAT, mode=modes), 'w') as fout:
fout.write(iteration)
@@ -577,7 +579,7 @@ class ConvertMg2Hf(ConvertBase):
for tp_rank in range(self.tp_size):
mg_save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=None,
iteration=self.args_cmd.iteration)
mg_tp_model = torch.load(os.path.join(mg_save_dir, self.mg_model_file_name), map_location='cpu', weights_only=False)
mg_tp_model = torch.load(os.path.join(mg_save_dir, self.mg_model_file_name), map_location='cpu', weights_only=True)
mg_tp_models.append(mg_tp_model)
hf_model = {}
@@ -751,13 +753,13 @@ class ConvertMg2Hf(ConvertBase):
def _update_hf_model_file(self, hf_model, model_file):
file_path = os.path.join(self.args_cmd.hf_dir, model_file)
exist_model = torch.load(file_path, map_location='cpu', weights_only=False) if os.path.exists(file_path) else {}
exist_model = torch.load(file_path, map_location='cpu', weights_only=True) if os.path.exists(file_path) else {}
for param_key in hf_model.keys():
if self.get_hf_model_file_based_param_key(param_key) == model_file:
exist_model[param_key] = hf_model[param_key]
os.makedirs(os.path.dirname(file_path), exist_ok=True)
os.makedirs(os.path.dirname(file_path), mode=0o750, exist_ok=True)
torch.save(exist_model, file_path)
def run(self):

View File

@@ -373,7 +373,9 @@ def _load_checkpoint(model_provider, queue, args):
md = build_metadata(args, margs)
queue.put(md)
model_hf.get_modules_from_pretrained()
print(f"args.trust_remote_code:{args.trust_remote_code}")
model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code)
model_mg.get_modules_from_config()
model_mg.update_module(model_hf)

View File

@@ -1,6 +1,7 @@
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
import abc
import os
import ast
import sys
import re
import json
@@ -23,7 +24,7 @@ from megatron.core import tensor_parallel
from mindspeed_llm.training.utils import parse_args
from mindspeed_llm.training import model_provider_func_wrapper
from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@@ -465,9 +466,9 @@ class HuggingfaceModel(ModelBase):
def initialize_args(self):
# Read huggingface args.
if self.args_cmd.save_model_type == 'hf':
cfg_dir = self.args_cmd.save_dir
cfg_dir = standardize_path(self.args_cmd.save_dir, check_write=True)
else:
cfg_dir = self.args_cmd.load_dir
cfg_dir = standardize_path(self.args_cmd.load_dir, check_read=True)
llama_args_path = os.path.join(cfg_dir, "config.json")
with open(llama_args_path) as f:
self.args = json.load(f)
@@ -501,7 +502,7 @@ class HuggingfaceModel(ModelBase):
self.args.save_lora_to_hf = self.args_cmd.save_lora_to_hf
self.args.noop_layers = self.args_cmd.noop_layers
def get_modules_from_config(self, device_map="cpu", trust_remote_code=True):
def get_modules_from_config(self, device_map="cpu", trust_remote_code=False):
# Load Huggingface model.
if self.args_cmd.save_model_type == "hf":
load_dir = self.args_cmd.save_dir
@@ -513,9 +514,9 @@ class HuggingfaceModel(ModelBase):
hf_model.to_empty(device=device_map)
self.module = [hf_model]
if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]:
self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}'))
self.module[0] = self.module[0].to(getattr(torch, self.args.torch_dtype))
def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True):
def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=False):
# Load Huggingface model.
if self.args_cmd.save_model_type == "hf":
load_dir = self.args_cmd.save_dir
@@ -542,7 +543,8 @@ class HuggingfaceModel(ModelBase):
)
self.module = [get_peft_model(self.module[0], lora_config)]
if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]:
self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}'))
dtype = getattr(torch, self.args.torch_dtype)
self.module[0] = self.module[0].to(dtype)
def get_lora_key(self, layer_name, prefix):
return f"{layer_name}.{prefix}"

View File

@@ -16,7 +16,7 @@ logger.getLogger().setLevel(logger.INFO)
def load_data(file_path):
try:
data = torch.load(file_path, map_location='cpu', weights_only=False)
data = torch.load(file_path, map_location='cpu', weights_only=True)
return data
except Exception as e:
logger.info(f"Error while loading file '{file_path}': {e}")
@@ -74,7 +74,7 @@ class OptimBaseProcessor(abc.ABC):
@staticmethod
def check_mkdir(dir_path):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
os.makedirs(dir_path, mode=0o750, exist_ok=True)
def get_ckpt_path(self, tp_rank, pp_rank, ep_rank=None, suffix=""):
"""
@@ -317,7 +317,7 @@ class OptimSourceProcessor(OptimBaseProcessor):
@staticmethod
def make_param_index_map(model_path):
weights = torch.load(model_path, map_location=torch.device('cpu'), weights_only=False)
weights = torch.load(model_path, map_location=torch.device('cpu'), weights_only=True)
# Count the number of models in the checkpoint
model_num = sum([1 if key.startswith("model") else 0 for key in weights.keys()])
@@ -478,7 +478,7 @@ class OptimSourceProcessor(OptimBaseProcessor):
optim_path = self.optimizer_paths[tp_rank][pp_rank][ep_rank]
logger.info(f"Splitting from {optim_path} ...")
merged_ckpt = torch.load(optim_path, map_location="cpu", weights_only=False)
merged_ckpt = torch.load(optim_path, map_location="cpu", weights_only=True)
if isinstance(merged_ckpt, dict):
merged_ckpt = [merged_ckpt]
@@ -643,7 +643,7 @@ class OptimTargetProcessor(OptimBaseProcessor):
for key in ["param", "exp_avg", "exp_avg_sq"]:
load_path = f"{ckpt_name}_{key}{ckpt_ext}"
logger.info(f" {key} is loaded from {load_path}.")
optim_ckpt = torch.load(load_path, map_location="cpu", weights_only=False)
optim_ckpt = torch.load(load_path, map_location="cpu", weights_only=True)
flatten_ckpt = self.flatten_optimizer_ckpt(optim_ckpt, pp_rank, key)

View File

@@ -9,7 +9,7 @@ from collections import defaultdict, OrderedDict
from tqdm import tqdm
import torch
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@@ -39,7 +39,7 @@ class OptimConverter(abc.ABC):
def get_optim_param_from_src_model_ckpt(self):
ckpt_path = self.src_optim.model_paths[0][0][0]
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
self.optim_param = model_ckpt['optimizer']
self.opt_param_scheduler = model_ckpt['opt_param_scheduler']
@@ -57,7 +57,7 @@ class OptimConverter(abc.ABC):
bool: True if successful, False otherwise.
"""
try:
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
# Apply modifications
for key, value in modifications.items():

View File

@@ -539,9 +539,9 @@ def save_huggingface(args, model):
from .models import get_huggingface_model
model_hf = get_huggingface_model(args)
if args.load_hf_from_config:
model_hf.get_modules_from_config()
model_hf.get_modules_from_config(trust_remote_code=args.trust_remote_code)
else:
model_hf.get_modules_from_pretrained()
model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code)
args_cmd = model_hf.get_args_cmd()
model_hf.update_module(model)

View File

@@ -34,7 +34,7 @@ from mindspeed_llm.tasks.evaluation.eval_utils.agi_utils import (
get_default_instruction,
get_pred_postprocess_func
)
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -42,7 +42,7 @@ class AGIEvalExam(DatasetEval):
def __init__(self, test_dir, eval_args,
instruction_template="{fewshot_template} {question}\n{question_template}\n{options}"
"\n{answer_template}"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.batch_size = eval_args.evaluation_batch_size
self.rank = dist.get_rank()

View File

@@ -32,7 +32,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.dataset_eval import DatasetEval
from mindspeed_llm.tasks.evaluation.eval_impl.template import BBH_TEMPLATE_DIR, BBH_COT_TEMPLATE_DIR, get_eval_template
from mindspeed_llm.tasks.evaluation.eval_utils.bbh_utils import bbh_mcq_postprocess, bbh_freeform_postprocess, bbh_true_or_false_questions
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -75,7 +75,7 @@ bbh_free_form_sets = [
class BBHEval(DatasetEval):
def __init__(self, test_dir, eval_args,
instruction_template="{fewshot_template}Q: {question}\nA:"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.batch_size = eval_args.evaluation_batch_size
self.rank = dist.get_rank()

View File

@@ -28,7 +28,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
from mindspeed_llm.tasks.evaluation.eval_utils.boolq_utils import first_capital_postprocess
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
class BoolqEval(DatasetEval):
def __init__(self, test_dir, eval_args,
instruction_template="{passage}\nQuestion: {question}?\nAnswer:"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.alternative_prompt = "{title} -- {passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:"
self.answer_reference = {'True': 'A', 'False': 'B', 'Yes': 'A', 'No': 'B', 'Y': 'A', 'N': 'B', 'T': 'A', 'F': 'B'}

View File

@@ -31,7 +31,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.eval_utils.ceval_utils import format_ceval_templates, first_capital_postprocess
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
from mindspeed_llm.tasks.evaluation.eval_impl.template import CEVAL_TEMPLATE_DIR, get_eval_template
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -39,7 +39,7 @@ logger = logging.getLogger(__name__)
class CEvalExam(DatasetEval):
def __init__(self, test_dir, eval_args,
instruction_template="{fewshot_template}\n\n问:{question}\n答:"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.batch_size = eval_args.evaluation_batch_size
self.rank = dist.get_rank()

View File

@@ -31,9 +31,8 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.eval_utils.cmmlu_utils import cmmlu_subject_mapping, first_option_postprocess, cmmlu_format_example
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
from .template import CMMLU_TEMPLATE_DIR, get_eval_template
logger = logging.getLogger(__name__)
@@ -43,7 +42,7 @@ class CmmluEval(DatasetEval):
"{question}\n答案: ",
output_template1=r".*(?P<答案>[A|B|C|D])\..*",
output_template2=r"(?P<答案>[A|B|C|D])"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.output_template = [output_template1, output_template2]
self.batch_size = eval_args.evaluation_batch_size

View File

@@ -30,7 +30,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.eval_utils.gsm8k_utils import four_shots_prompt, gsm8k_postprocess
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
from mindspeed_llm.tasks.evaluation.eval_impl.template import GSM8K_TEMPLATE_DIR
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -38,7 +38,7 @@ class Gsm8kEval(DatasetEval):
def __init__(self, test_dir, eval_args,
instruction_template="{fewshot_template}\n\n{question}",
output_template=r'The answer is (.*?) '):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.output_template = output_template
self.batch_size = eval_args.evaluation_batch_size

View File

@@ -18,7 +18,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import postprocess
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logger = logging.getLogger(__name__)
@@ -27,7 +27,7 @@ class HellaswagEval(DatasetEval):
def __init__(self, test_dir, eval_args,
output_template1=r".*(?P<answer>[A|B|C|D])\..*",
output_template2=r"(?P<answer>[A|B|C|D])"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.output_template = [output_template1, output_template2]
self.instruction_template = ('{ctx}\nQuestion: Which ending makes the most sense?\n'
'A. {A}\nB. {B}\nC. {C}\nD. {D}\n'

View File

@@ -19,6 +19,7 @@ import logging
import re
import sys
import subprocess
import ast
from typing import Iterable, Dict
import pandas as pd
import tqdm
@@ -32,17 +33,57 @@ from mindspeed_llm.tasks.evaluation.eval_api.dataset_eval import DatasetEval
from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.training.utils import WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
from mindspeed_llm.tasks.evaluation.eval_utils.human_utils import humaneval_postprocess, get_score
logger = logging.getLogger(__name__)
def is_code_dangerous(code: str, dangerous_patterns) -> bool:
"""AST 检测提权、外联、文件篡改"""
# 正则检测(快速过滤)
for pattern in dangerous_patterns:
if re.search(pattern, code):
return True
# AST semantic analysis (to prevent bypasses)
try:
tree = ast.parse(code)
for node in ast.walk(tree):
if isinstance(node, ast.Call):
if isinstance(node.func, ast.Name):
if node.func.id in ("exec", "eval", "open", "os", "subprocess"):
return True
elif isinstance(node, (ast.Import, ast.ImportFrom)):
for alias in node.names:
if alias.name in ("os", "sys", "subprocess"):
return True
# Detect os.system("sudo ...")
if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == "system":
return True
if any(isinstance(arg, ast.Str) and ("sudo" in arg.s or "curl" in arg.s) for arg in node.args):
return True
# Detect dynamic imports, e.g. __import__("os").system(...)
if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "__import__":
return True
return False
except SyntaxError:
return True  # treat syntax errors as dangerous
def extract_answer_code(answer, task: dict):
"""
:param answer:
:param task:
:return:
"""
"""安全生成测试文件"""
if is_code_dangerous(answer, self.dangerous_patterns) or is_code_dangerous(task["test"], self.dangerous_patterns):
raise ValueError("Unsafe code detected")
task_id = task['task_id']
target_func = task['entry_point']
test_case = task['test']
@@ -51,7 +92,7 @@ def extract_answer_code(answer, task: dict):
code_lines = code.split("\n")
target_func_flag = False
if not os.path.exists(CODE_TEST_LOG_DIR):
os.makedirs(CODE_TEST_LOG_DIR)
os.makedirs(CODE_TEST_LOG_DIR, mode=0o750, exist_ok=True)
test_code_path = "{}/{}".format(CODE_TEST_LOG_DIR, save_file)
with os.fdopen(os.open(test_code_path, WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES), 'w') as f:
f.write("from typing import List\n")
@@ -85,7 +126,7 @@ def extract_answer_code(answer, task: dict):
class HumanEval(DatasetEval):
def __init__(self, test_dir, eval_args):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
instruction_template = eval_args.instruction_template
if instruction_template:
self.instruction_template = instruction_template
@@ -96,6 +137,11 @@ class HumanEval(DatasetEval):
self.file_pbar = None
self.task_pbar = None
self.prompt = 'Complete the following python code:\n{prompt}'
self.dangerous_patterns = []
with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f:
self.dangerous_patterns = json.load(f)
print(self.dangerous_patterns)
def read_problems(self) -> Dict[str, Dict]:
return {task["task_id"]: task for task in self.stream_jsonl(self.test_dir)}

View File

@@ -30,6 +30,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import _format_example, postprocess
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
from .template import MMLU_TEMPLATE_DIR, get_eval_template
@@ -42,7 +43,7 @@ class MmluEval(DatasetEval):
"{question}\nAnswer:",
output_template1=r".*(?P<answer>[A|B|C|D])\..*",
output_template2=r"(?P<answer>[A|B|C|D])"):
self.test_dir = test_dir
self.test_dir = standardize_path(test_dir, check_read=True)
self.instruction_template = instruction_template
self.output_template = [output_template1, output_template2]
self.batch_size = eval_args.evaluation_batch_size

View File

@@ -157,7 +157,7 @@ def load_datasets(config: NeedleBenchConfig):
dataset = []
needle_file_path = os.path.join(config.dataset_dir, "needles.jsonl")
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(cache_dir, mode=0o750, exist_ok=True)
cache_file = "9b5ad71b2ce5302211f9c61530b329a4922fc6a4"
dest_filepath = os.path.join(cache_dir, cache_file)
# Refer to needlebench-evaluation.md in docs to download cl100k_base.tiktoken and place it in this dataset folder

View File

@@ -1,5 +1,6 @@
# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
from functools import reduce
import re
import os
import stat
@@ -52,17 +53,34 @@ def safe_open(file_path: str, mode='r', encoding=None, permission_mode=0o600, is
mode, encoding=encoding)
def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True):
def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True, check_read=True, check_write=True):
"""
check path
param: path
return: data real path after check
"""
check_path_is_none(path)
if path:
path = os.path.realpath(path)
else:
return None
if os.path.exists(path):
if check_read and not os.access(path, os.R_OK):
raise RuntimeError(f"File {path} not readable")
if check_write and not os.access(path, os.W_OK):
raise RuntimeError(f"File {path} not writable")
else:
print(f"Path: {path} not exists")
check_path_length_lt(path, max_path_length)
if check_link:
check_path_is_link(path)
path = os.path.realpath(path)
pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z]|[\u4e00-\u9fa5])+'
if not re.fullmatch(pattern, path):
raise RuntimeError(f"Invalid input path: {path}")
return path
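
A short usage sketch of the extended helper under the checks shown above; the import path matches this commit, the directories are placeholders:

```python
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path

# input directory: RuntimeError if it exists but is not readable
load_dir = standardize_path("/data/hf_model", check_read=True, check_write=False)

# output directory: RuntimeError if it exists but is not writable
save_dir = standardize_path("/data/mg_ckpt", check_write=True, check_read=False)
```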

View File

@@ -7,7 +7,7 @@ try:
from mindspeed.core.pipeline_parallel.fb_overlap.modules.attention import launch_async_all2all
from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import get_post_process_flag
except ImportError:
pass
print("[warning] failed import dualpipe modules, not support dualpipe")
from mindspeed.core.transformer.moe.comm_utils import async_all_gather
from mindspeed.core.tensor_parallel.random import CheckpointWithoutOutput
from megatron.training.utils import get_args

View File

@@ -2,7 +2,7 @@ import argparse
import gc
import json
import re
import os
import jsonlines
import pandas as pd
import torch
@@ -13,6 +13,7 @@ from vllm.distributed.parallel_state import (destroy_distributed_environment, de
from utils import blending_datasets, PromptGtAnswerDataset, apply_GenRM_template, rejection_sampling_processor
from mindspeed_llm.tasks.posttrain.verifier.rule_verifier import preprocess_box_response_for_qwen_prompt
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
def clean_up():
@@ -36,13 +37,13 @@ def batch_generate_vllm(args):
dummy_strategy.args = args
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=args.trust_remote_code)
# configure model
llm = LLM(
model=args.pretrain,
tensor_parallel_size=args.tp_size,
trust_remote_code=True,
trust_remote_code=args.trust_remote_code,
seed=args.seed,
max_num_seqs=args.max_num_seqs,
enable_prefix_caching=args.enable_prefix_caching,
@@ -107,7 +108,7 @@ def batch_GenRM_rejection_sampling(args):
llm = LLM(
model=args.pretrain,
tensor_parallel_size=args.tp_size,
trust_remote_code=True,
trust_remote_code=args.trust_remote_code,
seed=args.seed,
max_num_seqs=args.max_num_seqs,
enable_prefix_caching=args.enable_prefix_caching,
@@ -215,9 +216,15 @@ if __name__ == "__main__":
parser.add_argument("--iter", type=int, default=None,
help="Used to slice the datasets in range iter * rollout_batch_size: (iter + 1) * rollout_batch_size", )
parser.add_argument("--rollout-batch-size", type=int, default=2048, help="Number of samples to generate")
parser.add_argument('--trust-remote-code',
action='store_true',
default=False,
help='enable trust-remote-code for transformer to load model')
args = parser.parse_args()
args.output_path = standardize_path(args.output_path, check_write=True)
if args.task and args.task == "generate_vllm":
batch_generate_vllm(args)
elif args.task and args.task == "rejection_sampling":

View File

@@ -31,7 +31,7 @@ def blending_datasets(
ext = os.path.splitext(dataset)[-1]
# local python script
if ext == ".py" or (os.path.isdir(dataset) and os.path.exists(os.path.join(dataset, f"{dataset_basename}.py"))):
data = load_dataset(dataset, trust_remote_code=True)
data = load_dataset(dataset, trust_remote_code=False)
strategy.print(f"loaded {dataset} with python script")
# local text file
elif ext in [".json", ".jsonl", ".csv"]:

View File

@@ -13,7 +13,7 @@ from megatron.training import get_timers
try:
from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import set_post_process_flag
except ImportError:
pass
print("[warning] failed import dualpipe modules, not support dualpipe")
from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank, generate_actual_seq_len
from mindspeed_llm.tasks.posttrain.base import BaseTrainer

View File

@@ -1,5 +1,5 @@
from typing import Union
import os
import torch
from megatron.core import mpu, dist_checkpointing
@@ -19,6 +19,7 @@ from megatron.training.training import compute_throughputs_and_append_to_progres
from megatron.training.utils import unwrap_model, print_rank_0, append_to_progress_log
from megatron.training.yaml_arguments import core_transformer_config_from_yaml
from mindspeed_llm.tasks.posttrain.orm.orm_model import GPTRewardModel
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
def model_provider(is_reward_model=False, pre_process=True, post_process=True) -> Union[GPTModel]:
@@ -137,6 +138,8 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
if save_model_type:
save_path = args.save + '/' + save_model_type
save_path = standardize_path(save_path, check_write=True)
ckpt_format = args.dist_ckpt_format if args.use_dist_ckpt else 'torch'
print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format(
iteration, save_path, ckpt_format))

View File

@@ -38,7 +38,7 @@ def parse_digits(num):
try:
return float(num) / 100
except:
pass
return None
return None

View File

@@ -1,7 +1,7 @@
import random
import re
from typing import TypeVar, Iterable, List, Union, Any, Dict
import ast
import regex
import sympy
from latex2sympy2 import latex2sympy
@@ -75,7 +75,7 @@ def convert_word_number(text: str) -> str:
try:
text = str(w2n.word_to_num(text))
except ValueError:
pass
return None
return text
@@ -468,7 +468,7 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
pred = clean_units(pred)
try:
tmp = str(latex2sympy(pred))
pred = str(eval(tmp))
pred = str(ast.literal_eval(tmp))
except Exception:
if re.match(r"-?[\d\.]+\s\D+$", pred):
pred = pred.split(" ")[0]

View File

@@ -21,16 +21,14 @@ def load_jsonl(file: Union[str, Path]) -> Iterable[Any]:
yield json.loads(line)
except json.JSONDecodeError as e:
print("Error in loading JSON:", line, "Error:", e)
pass
except Exception as e:
print("Unexpected error in loading:", line, "Error:", e)
pass
def save_jsonl(samples, save_path):
# ensure path
folder = os.path.dirname(save_path)
os.makedirs(folder, exist_ok=True)
os.makedirs(folder, mode=0o750, exist_ok=True)
with open(save_path, "w", encoding="utf-8") as f:
for sample in samples:

View File

@@ -1034,6 +1034,7 @@ def build_dataset(args):
# for MOSS, streaming is needed.
args.streaming = True
if args.hf_datasets_params:
args.hf_datasets_params = standardize_path(args.hf_datasets_params, check_read=True)
with open(args.hf_datasets_params, 'r') as fin:
param_dict = json.load(fin)
return load_dataset(**param_dict)
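
A hedged illustration of what the file behind --hf-datasets-params could contain, expressed as the dict that json.load would produce; the keys are just datasets.load_dataset keyword arguments and the values are placeholders:

```python
from datasets import load_dataset

param_dict = {
    "path": "json",                         # loader name
    "data_files": "/path/to/train.jsonl",   # placeholder path
    "split": "train",
}
dataset = load_dataset(**param_dict)
```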

View File

@@ -190,7 +190,8 @@ class ToolFormatter(Formatter):
return [default_tool_formatter(tools)]
else:
raise NotImplementedError
except Exception:
except Exception as e:
print(f"[warning] Unexpected error processing content: {content}. Error: {e}")
return [""]
def extract(self, content: str) -> Union[str, Tuple[str, str]]:

View File

@@ -23,7 +23,7 @@ from datasets import load_dataset, concatenate_datasets, interleave_datasets
from mindspeed_llm.tasks.preprocess.templates import Role
from mindspeed_llm.tasks.preprocess.parser import InstructionDatasetAttr
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -103,6 +103,7 @@ def get_dataset_list(data_args) -> List["InstructionDatasetAttr"]:
else:
dataset_names = []
data_args.dataset_dir = standardize_path(data_args.dataset_dir, check_read=True)
try:
with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
dataset_info = json.load(f)

View File

@@ -820,6 +820,10 @@ def _add_training_args(parser):
action='store_true',
default=False,
help='enable deterministic computing for npu')
group.add_argument('--trust-remote-code',
action='store_true',
default=False,
help='enable trust-remote-code for transformer to load model')
group.add_argument('--jit-compile', action='store_true', default=False,
help='Setting jit compile mode to True')
group.add_argument('--prompt-type', type=str, default=None,

View File

@@ -123,7 +123,7 @@ def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None,
return state_dict, checkpoint_name, release
try:
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False)
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=True)
except ModuleNotFoundError:
from megatron.legacy.fp16_deprecated import loss_scaler
# For backward compatibility.
@@ -134,7 +134,7 @@ def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None,
sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
'megatron.legacy.fp16_deprecated.loss_scaler']
sys.modules['megatron.model'] = sys.modules['megatron.legacy.model']
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False)
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=True)
sys.modules.pop('fp16.loss_scaler', None)
sys.modules.pop('megatron.fp16.loss_scaler', None)
sys.modules.pop('megatron.model', None)

View File

@@ -49,6 +49,7 @@ def build_tokenizer(args):
model_max_length=args.seq_length,
use_fast=args.tokenizer_not_use_fast,
prompt_type=args.prompt_type,
trust_remote_code=args.trust_remote_code,
**hf_tokenizer_kwargs
)
@@ -108,7 +109,7 @@ class TokenizerAdaptor:
class _AutoTokenizer(MegatronTokenizer):
"""AutoTokenizer for Hf Pretrained model loading."""
def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, **kwargs):
def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, trust_remote_code=False, **kwargs):
name = tokenizer_name_or_path
super().__init__(name)
hf_tokenizer_kwargs = kwargs
@@ -117,7 +118,7 @@ class _AutoTokenizer(MegatronTokenizer):
hf_tokenizer_kwargs["model_max_length"] = model_max_length
hf_tokenizer_kwargs["use_fast"] = use_fast
hf_tokenizer_kwargs["trust_remote_code"] = True
hf_tokenizer_kwargs["trust_remote_code"] = trust_remote_code
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs, local_files_only=True)
if (prompt_type is None) and (self.tokenizer.pad_token_id is None):
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
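
A sketch of how the tokenizer kwargs are assembled once the flag reaches _AutoTokenizer; the values and tokenizer path are placeholders:

```python
from transformers import AutoTokenizer

hf_tokenizer_kwargs = {
    "model_max_length": 4096,        # placeholder value
    "use_fast": True,
    "trust_remote_code": False,      # now taken from --trust-remote-code instead of hard-coded True
}
tokenizer = AutoTokenizer.from_pretrained(
    "/path/to/tokenizer",
    **hf_tokenizer_kwargs,
    local_files_only=True,
)
```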

View File

@@ -195,6 +195,10 @@ def add_tokenizer_args(parser):
default=[],
help="The labels represent the correctness of each reasoning step in the entire reasoning process.",
)
parser.add_argument('--trust-remote-code',
action='store_true',
default=False,
help='enable trust-remote-code for transformer to load model')
def add_output_args(parser):

View File

@@ -100,6 +100,7 @@ ROPE_ARGS="
GPT_ARGS="
--trust-remote-code \
--no-gradient-accumulation-fusion \
--spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
--reset-position-ids \

View File

@@ -46,6 +46,7 @@ MLA_ARGS="
MOE_ARGS="
--trust-remote-code \
--moe-grouped-gemm \
--moe-permutation-async-comm \
--use-fused-moe-token-permute-and-unpermute \

View File

@@ -34,6 +34,7 @@ DISTRIBUTED_ARGS="
"
GPT_ARGS="
--trust-remote-code \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \

View File

@@ -31,6 +31,7 @@ DISTRIBUTED_ARGS="
"
GPT_ARGS="
--trust-remote-code \
--use-mcore-models \
--transformer-impl local \
--tensor-model-parallel-size ${TP} \

View File

@@ -97,7 +97,7 @@ def compare_state_dicts(state_dict1, state_dict2):
def process_file(file_path):
data = torch.load(file_path, map_location='cpu', weights_only=False)
data = torch.load(file_path, map_location='cpu', weights_only=True)
layer_ckpt = {}
# compatible with weights that include vpp
for key in data.keys():
@@ -164,8 +164,8 @@ def weight_compare(dir_1, dir_2, suffix="pt", use_md5=False):
if use_md5:
are_equal = (get_md5sum(path_1) == get_md5sum(path_2))
else:
state_dict1 = torch.load(path_1, weights_only=False)
state_dict2 = torch.load(path_2, weights_only=False)
state_dict1 = torch.load(path_1, weights_only=True)
state_dict2 = torch.load(path_2, weights_only=True)
are_equal = compare_state_dicts(state_dict1, state_dict2)
if not are_equal:
return False
@@ -192,8 +192,8 @@ def weight_compare_optim(dir_1, dir_2, suffix="pt", use_md5=False):
if use_md5:
are_equal = (get_md5sum(path_1) == get_md5sum(path_2))
else:
state_dict1 = torch.load(path_1, weights_only=False)
state_dict2 = torch.load(path_2, weights_only=False)
state_dict1 = torch.load(path_1, weights_only=True)
state_dict2 = torch.load(path_2, weights_only=True)
are_equal = compare_state_dicts(state_dict1, state_dict2)
if not are_equal:

View File

@@ -2,6 +2,7 @@
"test_deepseek2_hf2mcore_tp1pp4ep8": [
{
"param": {
"trust-remote-code":null,
"model-type":"GPT",
"load-model-type":"hf",
"save-model-type":"mg",
@@ -23,6 +24,7 @@
"test_deepseek2_mcore2hf_tp1pp4ep8": [
{
"param": {
"trust-remote-code":null,
"model-type":"GPT",
"load-model-type":"mg",
"save-model-type": "hf",