Mirror of https://gitee.com/ascend/MindSpeed-LLM.git, synced 2025-12-06 11:28:59 +08:00
!3113 [pytorch][bugfix]fix some bug for icsl
Merge pull request !3113 from jzh/210_uicsl
@@ -46,7 +46,9 @@
1. Users are advised to write training scripts that match the available runtime resources. If a script does not match the resources (for example, the dataset to be loaded exceeds the available memory, or data generated locally by the training script exceeds the available disk space), errors may occur and the process may exit unexpectedly.
2. MindSpeed-LLM uses PyTorch internally, and a version mismatch may cause runtime errors; see the PyTorch [security statement](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E) for details.
3. For security reasons, every torch.load call now uses weights_only=True; users should adjust this configuration as needed (see the sketch below this list).
4. HumanEval uses subprocess.run, which carries a security risk. To keep the feature usable, some safety checks have been added as a mitigation; users should build and extend the blacklist themselves to further harden it.
5. For security reasons, trust_remote_code=False is now set, so open-source models not supported by the official Transformers repository cannot be loaded remotely; if this is required, pass the --trust-remote-code argument manually.
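The following sketch (not part of the merge request; the model path is a placeholder) illustrates what notes 3 and 5 mean in practice, mirroring the calls changed elsewhere in this commit:

```python
# Illustrative only: the hardened defaults introduced by this merge request.
import torch
from transformers import AutoTokenizer

# weights_only=True restricts unpickling to tensors and plain containers;
# checkpoints that embed arbitrary Python objects will raise unless the user
# explicitly opts out or allowlists the missing types.
state_dict = torch.load("model_optim_rng.pt", map_location="cpu", weights_only=True)

# Remote code is no longer trusted by default; enable it only for models whose
# custom code has been reviewed (the scripts expose this as --trust-remote-code).
tokenizer = AutoTokenizer.from_pretrained(
    "/path/to/local/model",      # placeholder path
    trust_remote_code=False,
    local_files_only=True,
)
```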
## Public Network Address Declaration
configs/dangerous_shell.json (new file, 9 lines)
@@ -0,0 +1,9 @@
[
    r"os\.(system|popen|exec|setuid|setgid|chroot)\s*\(",
    r"subprocess\.(run|Popen|call)\s*\(",
    r"pty\.spawn\s*\(",
    r"(requests|urllib|socket|httpx)\.(get|post|urlopen|connect)\s*\(",
    r"open\s*\(",
    r"os\.(remove|rename|chmod|chown|mkdir)\s*\(",
    r"(eval|exec|__import__|globals|locals)\s*\("
]
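One caveat worth noting: the r"..." prefixes shown above are Python raw-string syntax, which json.load (used by HumanEval later in this commit to read the file) cannot parse. A hedged sketch of a JSON-compatible form of the same patterns, and of how they are consumed, assuming the file contains plain JSON strings with escaped backslashes:

```python
# Sketch only: what a json.load-compatible version of the pattern list would
# look like, and how an evaluator could apply it.
import json
import re

# JSON strings escape the backslashes instead of using Python's r"" prefix, e.g.
#   ["os\\.(system|popen|exec|setuid|setgid|chroot)\\s*\\(",
#    "subprocess\\.(run|Popen|call)\\s*\\("]
with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f:
    dangerous_patterns = json.load(f)

code = 'subprocess.run(["ls", "-l"])'
print(any(re.search(p, code) for p in dangerous_patterns))  # True for this input
```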
@@ -76,6 +76,10 @@ def main():
                        help='Enable only save lora-checkpoint to hf')
    parser.add_argument('--load-checkpoint-loosely', action='store_true', default=False,
                        help='Enable loading checkpoint not strictly.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        default=False,
                        help='enable trust-remote-code for transformer to load model')
    known_args, _ = parser.parse_known_args()
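A hedged usage sketch (it assumes the parser from the hunk above; the model path is a placeholder) of how the new flag is expected to flow into Hugging Face loading calls:

```python
# The flag defaults to False, so remote code only runs when the user opts in.
from transformers import AutoModelForCausalLM

known_args, _ = parser.parse_known_args()
model = AutoModelForCausalLM.from_pretrained(
    "/path/to/hf/model",                      # placeholder path
    trust_remote_code=known_args.trust_remote_code,
)
```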
@@ -380,7 +380,7 @@ def main():
        model_provider=model_provider,
        pretrained_model_name_or_path=args.load
    )
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=False, local_files_only=True)

    rank = dist.get_rank()
    if 'cmmlu' in args.task:
@@ -12,7 +12,7 @@ import safetensors
|
||||
import torch
|
||||
import safetensors.torch
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger.basicConfig(format="")
|
||||
logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
@@ -75,8 +75,8 @@ class CkptConvert(object):
|
||||
self.vpp_stage = vpp_stage
|
||||
if vpp_stage is not None:
|
||||
self.vpp_size = self.num_layers // self.pp_size // self.vpp_stage
|
||||
self.hf_model_path = hf_model_path
|
||||
self.mg_save_path = mg_save_path
|
||||
self.hf_model_path = standardize_path(hf_model_path, check_read=True)
|
||||
self.mg_save_path = standardize_path(mg_save_path, check_write=True)
|
||||
self.num_layer_list = num_layer_list
|
||||
self.noop_layers = noop_layers
|
||||
self.moe_grouped_gemm = moe_grouped_gemm
|
||||
@@ -138,7 +138,7 @@ class CkptConvert(object):
        """megatron model path"""
        iter_mg_path = os.path.join(mg_path, "iter_0000001")
        if not os.path.exists(mg_path):
            os.makedirs(mg_path, exist_ok=True)
            os.makedirs(mg_path, mode=0o750, exist_ok=True)

        with open(os.path.join(mg_path, "latest_checkpointed_iteration.txt"), 'w') as f:
            f.write("1")
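A brief, hedged note on the mode=0o750 pattern used throughout this merge request: os.makedirs applies the process umask to the requested mode (the directory ends up with mode & ~umask), so pinning the permission exactly may need an explicit chmod afterwards. The path below is a placeholder.

```python
import os

os.makedirs("/data/ckpt/iter_0000001", mode=0o750, exist_ok=True)
os.chmod("/data/ckpt/iter_0000001", 0o750)   # optional: enforce 0o750 regardless of umask
```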
@@ -786,7 +786,7 @@ class CkptConvert(object):
|
||||
for tp_rank in range(self.tp_size):
|
||||
save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank)
|
||||
parallel_save_path = os.path.join(save_model_path, save_prefix)
|
||||
os.makedirs(parallel_save_path)
|
||||
os.makedirs(parallel_save_path, mode=0o750, exist_ok=True)
|
||||
save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt")
|
||||
logger.info(f"Saving to {save_file_name}")
|
||||
|
||||
@@ -845,7 +845,7 @@ class CkptConvert(object):
|
||||
for tp_rank in range(self.tp_size):
|
||||
save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank)
|
||||
parallel_save_path = os.path.join(save_model_path, save_prefix)
|
||||
os.makedirs(parallel_save_path, exist_ok=True)
|
||||
os.makedirs(parallel_save_path, mode=0o750, exist_ok=True)
|
||||
save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt")
|
||||
logger.info(f"Saving to {save_file_name}")
|
||||
model_dict = {"checkpoint_version": 3.0, "iteration": 1}
|
||||
|
||||
@@ -14,7 +14,7 @@ import tqdm
|
||||
import torch
|
||||
import torch_npu
|
||||
import safetensors.torch
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger.basicConfig(format="")
|
||||
logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
@@ -33,7 +33,7 @@ GLOBAL_LM_HEAD_WEIGHTS = None

def load_data(file_path):
    logger.info(f"Loading the checkpoint from {file_path}.")
    return torch.load(file_path, map_location='cpu', weights_only=False)
    return torch.load(file_path, map_location='cpu', weights_only=True)


def tensor_memory_size(tensor):
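The weights_only=True switch applied here and in the other loaders below can reject checkpoints that pickle ordinary Python objects (Megatron checkpoints, for example, commonly store an argparse.Namespace under 'args'). A hedged sketch of one way to keep strict loading while allowlisting a known-safe type, assuming a recent PyTorch that provides torch.serialization.add_safe_globals:

```python
# Sketch only: strict unpickling plus an explicit allowlist for a trusted type.
import argparse
import torch

# add_safe_globals is available in recent PyTorch releases; without it, the
# fallback is weights_only=False for checkpoints from trusted sources only.
torch.serialization.add_safe_globals([argparse.Namespace])
ckpt = torch.load("model_optim_rng.pt", map_location="cpu", weights_only=True)
```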
@@ -73,15 +73,15 @@ class MgCkptConvert(object):
|
||||
self.ep_size = ep_size
|
||||
self.vpp_stage = vpp_stage
|
||||
|
||||
self.mg_model_path = mg_model_path
|
||||
self.hf_save_path = hf_save_path
|
||||
self.mg_model_path = standardize_path(mg_model_path, check_read=True)
|
||||
self.hf_save_path = standardize_path(hf_save_path, check_write=True)
|
||||
self.lora_model_path = lora_model_path
|
||||
self.iter_path = self.get_iter_path(self.mg_model_path)
|
||||
if self.lora_model_path is not None:
|
||||
self.lora_iter_path = self.get_iter_path(self.lora_model_path)
|
||||
|
||||
if not os.path.exists(self.hf_save_path):
|
||||
os.makedirs(self.hf_save_path)
|
||||
os.makedirs(self.hf_save_path, mode=0o750, exist_ok=True)
|
||||
|
||||
self.num_layers = num_layers
|
||||
self.noop_layers = noop_layers
|
||||
@@ -194,7 +194,7 @@ class MgCkptConvert(object):
|
||||
|
||||
directory = os.path.join(ckpt_path, f'iter_{iteration:07d}')
|
||||
|
||||
os.makedirs(directory, exist_ok=True)
|
||||
os.makedirs(directory, mode=0o750, exist_ok=True)
|
||||
|
||||
return directory
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from megatron.core.datasets.gpt_dataset import (_build_document_index,
|
||||
_build_shuffle_index
|
||||
)
|
||||
from mindspeed_llm.tasks.utils.error_utils import GPTDatasetSampleIndexError
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
from .blended_megatron_dataset_builder import need_to_build_dataset
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -65,11 +66,13 @@ def _build_document_sample_shuffle_indices(
|
||||
Returns:
|
||||
Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index
|
||||
"""
|
||||
|
||||
path_to_cache = self.config.path_to_cache
|
||||
if path_to_cache is None and not self.config.mock:
|
||||
path_to_cache = os.path.join(
|
||||
self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices"
|
||||
)
|
||||
path_to_cache = standardize_path(path_to_cache, check_write=True)
|
||||
|
||||
# start of megatron_adaptation,
|
||||
# here we change from (class)GPTDataset._build_document_sample_shuffle_indices
|
||||
@@ -196,8 +199,7 @@ def _build_document_sample_shuffle_indices(
|
||||
)
|
||||
|
||||
if any(sample_index[:, 0] < 0):
|
||||
_url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98"
|
||||
raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information")
|
||||
raise GPTDatasetSampleIndexError(f"Bad sample index.")
|
||||
|
||||
# Build the shuffle index
|
||||
if separate_final_epoch:
|
||||
@@ -210,7 +212,7 @@ def _build_document_sample_shuffle_indices(
|
||||
)
|
||||
|
||||
if path_to_cache:
|
||||
os.makedirs(path_to_cache, exist_ok=True)
|
||||
os.makedirs(path_to_cache, mode=0o750, exist_ok=True)
|
||||
# Write the description
|
||||
with open(path_to_description, "wt") as writer:
|
||||
writer.write(self.unique_description)
|
||||
@@ -256,8 +258,7 @@ def _build_document_sample_shuffle_indices(
|
||||
sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
|
||||
|
||||
if any(sample_index[:, 0] < 0):
|
||||
_url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98"
|
||||
raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information")
|
||||
raise GPTDatasetSampleIndexError(f"Bad sample index.")
|
||||
|
||||
t_end = time.time()
|
||||
log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Expert parallel groups."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from functools import wraps
|
||||
from typing import Optional
|
||||
@@ -24,7 +24,7 @@ import torch_npu
|
||||
import megatron
|
||||
from megatron.core.parallel_state import get_context_parallel_world_size, get_nccl_options
|
||||
from mindspeed.core.parallel_state import hccl_buffer_auto_adaptive, parse_hccl_buffer_string
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
_EXPERT_PARALLEL_GROUP = None
|
||||
_MPU_EXPERT_MODEL_PARALLEL_RANK = None
|
||||
_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None
|
||||
@@ -48,6 +48,8 @@ def initialize_model_parallel_decorator(initialize_model_parallel):
|
||||
from megatron.training.utils import print_rank_0
|
||||
timeout = timedelta(minutes=distributed_timeout_minutes)
|
||||
|
||||
nccl_communicator_config_path = standardize_path(nccl_communicator_config_path, check_read=True)
|
||||
|
||||
if pipeline_model_parallel_size == 2 and virtual_pipeline_model_parallel_size is not None:
|
||||
megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
|
||||
megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
|
||||
@@ -261,6 +263,8 @@ def initialize_model_parallel_wrapper(initialize_model_parallel):
|
||||
if args.hccl_group_buffer is not None:
|
||||
parse_hccl_buffer_string(args.hccl_group_buffer)
|
||||
|
||||
nccl_communicator_config_path = standardize_path(nccl_communicator_config_path, check_read=True)
|
||||
|
||||
data_parallel_size = 1 # dp 1
|
||||
rank = torch.distributed.get_rank()
|
||||
all_ep_groups = []
|
||||
|
||||
@@ -15,7 +15,7 @@ try:
|
||||
from mindspeed_llm.core.pipeline_parallel.dualpipe.gpt_model import gpt_model_forward_backward_overlaping
|
||||
from mindspeed_llm.core.pipeline_parallel.dualpipe.MTP_overlap import forward_overlap
|
||||
except ImportError:
|
||||
pass
|
||||
print("[warning] failed import dualpipe modules, not support dualpipe")
|
||||
|
||||
from mindspeed_llm.core.transformer.multi_token_prediction import MultiTokenPredictionLayer, MTPLossAutoScaler
|
||||
|
||||
|
||||
@@ -44,4 +44,8 @@ def _add_moba_args(parser):
|
||||
group.add_argument('--moba-calc-method', type=int, default=1,
|
||||
help='moba calculation method. 1: naive attention with naive attention operations; 2: use flash'
|
||||
'attention. default: 1')
|
||||
group.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='enable trust-remote-code for transformer to load model')
|
||||
return parser
|
||||
|
||||
@@ -8,6 +8,7 @@ import logging as logger
|
||||
import argparse
|
||||
import torch
|
||||
import safetensors.torch
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger.basicConfig(format="")
|
||||
logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
@@ -67,7 +68,7 @@ class CheckpointConverter:
|
||||
|
||||
try:
|
||||
if filename.endswith(".bin"):
|
||||
cur_weights = torch.load(file_path, map_location=torch.device('cpu'))
|
||||
cur_weights = torch.load(file_path, map_location=torch.device('cpu'), weights_only=True)
|
||||
model_dict.update(cur_weights)
|
||||
print(f"Successfully loaded: {filename}")
|
||||
loaded = True
|
||||
@@ -349,7 +350,7 @@ class CheckpointConverter:
|
||||
|
||||
out_iteration, input_model_dir, src_model_file = self.get_latest_checkpoint_model_file(self.args.load_dir)
|
||||
|
||||
src_model = torch.load(src_model_file, map_location='cpu', weights_only=False)
|
||||
src_model = torch.load(src_model_file, map_location='cpu', weights_only=True)
|
||||
|
||||
logger.info(f"Sample model {src_model_file} is loaded.\n")
|
||||
return out_iteration, input_model_dir, src_model
|
||||
@@ -381,7 +382,7 @@ class CheckpointConverter:
|
||||
input_pp_rank
|
||||
)
|
||||
|
||||
tp_models.append(torch.load(model_file, map_location='cpu', weights_only=False))
|
||||
tp_models.append(torch.load(model_file, map_location='cpu', weights_only=True))
|
||||
logger.info(f"Model {model_file} is loaded.")
|
||||
|
||||
if input_tp_rank > 1:
|
||||
@@ -475,7 +476,7 @@ class CheckpointConverter:
|
||||
dir_name += f"_{pp_idx:03d}"
|
||||
|
||||
save_path = os.path.join(args.save_dir, f"iter_{out_iteration:07d}", dir_name)
|
||||
os.makedirs(save_path, exist_ok=True)
|
||||
os.makedirs(save_path, mode=0o750, exist_ok=True)
|
||||
|
||||
return os.path.join(save_path, filename)
|
||||
|
||||
@@ -589,6 +590,8 @@ def run():
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
args.load_dir = standardize_path(args.load_dir, check_read=True)
|
||||
|
||||
converter = CheckpointConverter(args)
|
||||
converter.main()
|
||||
|
||||
|
||||
@@ -18,8 +18,8 @@ import json
|
||||
import os
|
||||
import stat
|
||||
import time
|
||||
|
||||
import torch
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
|
||||
def get_json_from_file(json_file):
|
||||
@@ -123,10 +123,11 @@ class ConvertBase:
|
||||
self.mg_latest_ckpt_file_name = "latest_checkpointed_iteration.txt"
|
||||
|
||||
# hf model index_file
|
||||
self.model_index_file = os.path.join(
|
||||
self.args_cmd.hf_dir,
|
||||
"pytorch_model.bin.index.json") if self.args_cmd.model_index_file is None \
|
||||
index_file = os.path.join(self.args_cmd.hf_dir, "pytorch_model.bin.index.json")
|
||||
self.model_index_file = index_file if self.args_cmd.model_index_file is None \
|
||||
else self.args_cmd.model_index_file
|
||||
self.model_index_file = standardize_path(self.model_index_file, check_read=True)
|
||||
|
||||
self.model_index_map = get_json_from_file(self.model_index_file)
|
||||
# hf model config_file
|
||||
self.config_file = os.path.join(
|
||||
@@ -217,7 +218,7 @@ class ConvertBase:
|
||||
hf_model[k] = f.get_tensor(k)
|
||||
elif str(model_files).endswith(".bin"):
|
||||
print(f"load file : {file_path}")
|
||||
hf_model = torch.load(file_path, map_location='cpu', weights_only=False)
|
||||
hf_model = torch.load(file_path, map_location='cpu', weights_only=True)
|
||||
else:
|
||||
raise ValueError(f"unsupported model file format. {os.path.splitext(hf_model)[-1]} ")
|
||||
return hf_model
|
||||
@@ -550,17 +551,18 @@ class ConvertHf2Mg(ConvertBase):
|
||||
ep_rank=ep_rank)
|
||||
save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank,
|
||||
iteration=iteration)
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
os.makedirs(save_dir, mode=0o750, exist_ok=True)
|
||||
torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name))
|
||||
else: # Dense Model
|
||||
model_dict = self._set_dense_mg_model(hf_model=hf_model, tp_rank=tp_rank, pp_rank=pp_rank)
|
||||
save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=None,
|
||||
iteration=iteration)
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
os.makedirs(save_dir, mode=0o750, exist_ok=True)
|
||||
torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name))
|
||||
|
||||
# write latest_checkpointed_iteration.txt
|
||||
latest_ckpt_file_path = os.path.join(self.args_cmd.mg_dir, self.mg_latest_ckpt_file_name)
|
||||
latest_ckpt_file_path = standardize_path(latest_ckpt_file_path, check_write=True)
|
||||
modes = stat.S_IWUSR | stat.S_IRUSR | stat.S_IWGRP | stat.S_IRGRP
|
||||
with os.fdopen(os.open(latest_ckpt_file_path, flags=os.O_RDWR | os.O_CREAT, mode=modes), 'w') as fout:
|
||||
fout.write(iteration)
|
||||
@@ -577,7 +579,7 @@ class ConvertMg2Hf(ConvertBase):
|
||||
for tp_rank in range(self.tp_size):
|
||||
mg_save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=None,
|
||||
iteration=self.args_cmd.iteration)
|
||||
mg_tp_model = torch.load(os.path.join(mg_save_dir, self.mg_model_file_name), map_location='cpu', weights_only=False)
|
||||
mg_tp_model = torch.load(os.path.join(mg_save_dir, self.mg_model_file_name), map_location='cpu', weights_only=True)
|
||||
mg_tp_models.append(mg_tp_model)
|
||||
|
||||
hf_model = {}
|
||||
@@ -751,13 +753,13 @@ class ConvertMg2Hf(ConvertBase):
|
||||
|
||||
def _update_hf_model_file(self, hf_model, model_file):
|
||||
file_path = os.path.join(self.args_cmd.hf_dir, model_file)
|
||||
exist_model = torch.load(file_path, map_location='cpu', weights_only=False) if os.path.exists(file_path) else {}
|
||||
exist_model = torch.load(file_path, map_location='cpu', weights_only=True) if os.path.exists(file_path) else {}
|
||||
|
||||
for param_key in hf_model.keys():
|
||||
if self.get_hf_model_file_based_param_key(param_key) == model_file:
|
||||
exist_model[param_key] = hf_model[param_key]
|
||||
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
os.makedirs(os.path.dirname(file_path), mode=0o750, exist_ok=True)
|
||||
torch.save(exist_model, file_path)
|
||||
|
||||
def run(self):
|
||||
|
||||
@@ -373,7 +373,9 @@ def _load_checkpoint(model_provider, queue, args):
|
||||
md = build_metadata(args, margs)
|
||||
queue.put(md)
|
||||
|
||||
model_hf.get_modules_from_pretrained()
|
||||
print(f"args.trust_remote_code:{args.trust_remote_code}")
|
||||
|
||||
model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code)
|
||||
model_mg.get_modules_from_config()
|
||||
|
||||
model_mg.update_module(model_hf)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
|
||||
import abc
|
||||
import os
|
||||
import ast
|
||||
import sys
|
||||
import re
|
||||
import json
|
||||
@@ -23,7 +24,7 @@ from megatron.core import tensor_parallel
|
||||
from mindspeed_llm.training.utils import parse_args
|
||||
from mindspeed_llm.training import model_provider_func_wrapper
|
||||
from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger.basicConfig(format="")
|
||||
logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
@@ -465,9 +466,9 @@ class HuggingfaceModel(ModelBase):
|
||||
def initialize_args(self):
|
||||
# Read huggingface args.
|
||||
if self.args_cmd.save_model_type == 'hf':
|
||||
cfg_dir = self.args_cmd.save_dir
|
||||
cfg_dir = standardize_path(self.args_cmd.save_dir, check_write=True)
|
||||
else:
|
||||
cfg_dir = self.args_cmd.load_dir
|
||||
cfg_dir = standardize_path(self.args_cmd.load_dir, check_read=True)
|
||||
llama_args_path = os.path.join(cfg_dir, "config.json")
|
||||
with open(llama_args_path) as f:
|
||||
self.args = json.load(f)
|
||||
@@ -501,7 +502,7 @@ class HuggingfaceModel(ModelBase):
|
||||
self.args.save_lora_to_hf = self.args_cmd.save_lora_to_hf
|
||||
self.args.noop_layers = self.args_cmd.noop_layers
|
||||
|
||||
def get_modules_from_config(self, device_map="cpu", trust_remote_code=True):
|
||||
def get_modules_from_config(self, device_map="cpu", trust_remote_code=False):
|
||||
# Load Huggingface model.
|
||||
if self.args_cmd.save_model_type == "hf":
|
||||
load_dir = self.args_cmd.save_dir
|
||||
@@ -513,9 +514,9 @@ class HuggingfaceModel(ModelBase):
|
||||
hf_model.to_empty(device=device_map)
|
||||
self.module = [hf_model]
|
||||
if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]:
|
||||
self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}'))
|
||||
self.module[0] = self.module[0].to(ast.literal_eval(f'torch.{self.args.torch_dtype}'))
|
||||
|
||||
def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True):
|
||||
def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=False):
|
||||
# Load Huggingface model.
|
||||
if self.args_cmd.save_model_type == "hf":
|
||||
load_dir = self.args_cmd.save_dir
|
||||
@@ -542,7 +543,8 @@ class HuggingfaceModel(ModelBase):
|
||||
)
|
||||
self.module = [get_peft_model(self.module[0], lora_config)]
|
||||
if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]:
|
||||
self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}'))
|
||||
dtype = getattr(torch, self.args.torch_dtype)
|
||||
self.module[0] = self.module[0].to(dtype)
|
||||
|
||||
def get_lora_key(self, layer_name, prefix):
|
||||
return f"{layer_name}.{prefix}"
|
||||
|
||||
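The two hunks above replace eval(f'torch.{...}') in different ways. A hedged note: ast.literal_eval only evaluates Python literals, so ast.literal_eval('torch.bfloat16') raises ValueError, whereas the getattr form in the second hunk resolves the dtype by attribute lookup without executing a string:

```python
import ast
import torch

dtype = getattr(torch, "bfloat16")       # torch.bfloat16, no code execution
try:
    ast.literal_eval("torch.bfloat16")   # not a literal: raises ValueError
except ValueError:
    pass
```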
@@ -16,7 +16,7 @@ logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
def load_data(file_path):
|
||||
try:
|
||||
data = torch.load(file_path, map_location='cpu', weights_only=False)
|
||||
data = torch.load(file_path, map_location='cpu', weights_only=True)
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.info(f"Error while loading file '{file_path}': {e}")
|
||||
@@ -74,7 +74,7 @@ class OptimBaseProcessor(abc.ABC):
|
||||
@staticmethod
|
||||
def check_mkdir(dir_path):
|
||||
if not os.path.exists(dir_path):
|
||||
os.makedirs(dir_path)
|
||||
os.makedirs(dir_path, mode=0o750, exist_ok=True)
|
||||
|
||||
def get_ckpt_path(self, tp_rank, pp_rank, ep_rank=None, suffix=""):
|
||||
"""
|
||||
@@ -317,7 +317,7 @@ class OptimSourceProcessor(OptimBaseProcessor):
|
||||
|
||||
@staticmethod
|
||||
def make_param_index_map(model_path):
|
||||
weights = torch.load(model_path, map_location=torch.device('cpu'), weights_only=False)
|
||||
weights = torch.load(model_path, map_location=torch.device('cpu'), weights_only=True)
|
||||
|
||||
# Count the number of models in the checkpoint
|
||||
model_num = sum([1 if key.startswith("model") else 0 for key in weights.keys()])
|
||||
@@ -478,7 +478,7 @@ class OptimSourceProcessor(OptimBaseProcessor):
|
||||
optim_path = self.optimizer_paths[tp_rank][pp_rank][ep_rank]
|
||||
logger.info(f"Splitting from {optim_path} ...")
|
||||
|
||||
merged_ckpt = torch.load(optim_path, map_location="cpu", weights_only=False)
|
||||
merged_ckpt = torch.load(optim_path, map_location="cpu", weights_only=True)
|
||||
if isinstance(merged_ckpt, dict):
|
||||
merged_ckpt = [merged_ckpt]
|
||||
|
||||
@@ -643,7 +643,7 @@ class OptimTargetProcessor(OptimBaseProcessor):
|
||||
for key in ["param", "exp_avg", "exp_avg_sq"]:
|
||||
load_path = f"{ckpt_name}_{key}{ckpt_ext}"
|
||||
logger.info(f" {key} is loaded from {load_path}.")
|
||||
optim_ckpt = torch.load(load_path, map_location="cpu", weights_only=False)
|
||||
optim_ckpt = torch.load(load_path, map_location="cpu", weights_only=True)
|
||||
|
||||
flatten_ckpt = self.flatten_optimizer_ckpt(optim_ckpt, pp_rank, key)
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from collections import defaultdict, OrderedDict
|
||||
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger.basicConfig(format="")
|
||||
logger.getLogger().setLevel(logger.INFO)
|
||||
|
||||
@@ -39,7 +39,7 @@ class OptimConverter(abc.ABC):
|
||||
|
||||
def get_optim_param_from_src_model_ckpt(self):
|
||||
ckpt_path = self.src_optim.model_paths[0][0][0]
|
||||
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
|
||||
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
|
||||
self.optim_param = model_ckpt['optimizer']
|
||||
self.opt_param_scheduler = model_ckpt['opt_param_scheduler']
|
||||
|
||||
@@ -57,7 +57,7 @@ class OptimConverter(abc.ABC):
|
||||
bool: True if successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
|
||||
model_ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
|
||||
|
||||
# Apply modifications
|
||||
for key, value in modifications.items():
|
||||
|
||||
@@ -539,9 +539,9 @@ def save_huggingface(args, model):
|
||||
from .models import get_huggingface_model
|
||||
model_hf = get_huggingface_model(args)
|
||||
if args.load_hf_from_config:
|
||||
model_hf.get_modules_from_config()
|
||||
model_hf.get_modules_from_config(trust_remote_code=args.trust_remote_code)
|
||||
else:
|
||||
model_hf.get_modules_from_pretrained()
|
||||
model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code)
|
||||
args_cmd = model_hf.get_args_cmd()
|
||||
|
||||
model_hf.update_module(model)
|
||||
|
||||
@@ -34,7 +34,7 @@ from mindspeed_llm.tasks.evaluation.eval_utils.agi_utils import (
|
||||
get_default_instruction,
|
||||
get_pred_postprocess_func
|
||||
)
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ class AGIEvalExam(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
instruction_template="{fewshot_template} {question}\n{question_template}\n{options}"
|
||||
"\n{answer_template}"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
self.rank = dist.get_rank()
|
||||
|
||||
@@ -32,7 +32,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.dataset_eval import DatasetEval
|
||||
from mindspeed_llm.tasks.evaluation.eval_impl.template import BBH_TEMPLATE_DIR, BBH_COT_TEMPLATE_DIR, get_eval_template
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.bbh_utils import bbh_mcq_postprocess, bbh_freeform_postprocess, bbh_true_or_false_questions
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -75,7 +75,7 @@ bbh_free_form_sets = [
|
||||
class BBHEval(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
instruction_template="{fewshot_template}Q: {question}\nA:"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
self.rank = dist.get_rank()
|
||||
|
||||
@@ -28,7 +28,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
|
||||
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.boolq_utils import first_capital_postprocess
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
|
||||
class BoolqEval(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
instruction_template="{passage}\nQuestion: {question}?\nAnswer:"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.alternative_prompt = "{title} -- {passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:"
|
||||
self.answer_reference = {'True': 'A', 'False': 'B', 'Yes': 'A', 'No': 'B', 'Y': 'A', 'N': 'B', 'T': 'A', 'F': 'B'}
|
||||
|
||||
@@ -31,7 +31,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.ceval_utils import format_ceval_templates, first_capital_postprocess
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
|
||||
from mindspeed_llm.tasks.evaluation.eval_impl.template import CEVAL_TEMPLATE_DIR, get_eval_template
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -39,7 +39,7 @@ logger = logging.getLogger(__name__)
|
||||
class CEvalExam(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
instruction_template="{fewshot_template}\n\n问:{question}\n答:"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
self.rank = dist.get_rank()
|
||||
|
||||
@@ -31,9 +31,8 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
|
||||
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.cmmlu_utils import cmmlu_subject_mapping, first_option_postprocess, cmmlu_format_example
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
from .template import CMMLU_TEMPLATE_DIR, get_eval_template
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -43,7 +42,7 @@ class CmmluEval(DatasetEval):
|
||||
"{question}\n答案: ",
|
||||
output_template1=r".*(?P<答案>[A|B|C|D])\..*",
|
||||
output_template2=r"(?P<答案>[A|B|C|D])"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.output_template = [output_template1, output_template2]
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
|
||||
@@ -30,7 +30,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.gsm8k_utils import four_shots_prompt, gsm8k_postprocess
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
|
||||
from mindspeed_llm.tasks.evaluation.eval_impl.template import GSM8K_TEMPLATE_DIR
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ class Gsm8kEval(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
instruction_template="{fewshot_template}\n\n{question}",
|
||||
output_template=r'The answer is (.*?) '):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.output_template = output_template
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
|
||||
@@ -18,7 +18,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
|
||||
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import postprocess
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -27,7 +27,7 @@ class HellaswagEval(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args,
|
||||
output_template1=r".*(?P<answer>[A|B|C|D])\..*",
|
||||
output_template2=r"(?P<answer>[A|B|C|D])"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.output_template = [output_template1, output_template2]
|
||||
self.instruction_template = ('{ctx}\nQuestion: Which ending makes the most sense?\n'
|
||||
'A. {A}\nB. {B}\nC. {C}\nD. {D}\n'
|
||||
|
||||
@@ -19,6 +19,7 @@ import logging
import re
import sys
import subprocess
import ast
from typing import Iterable, Dict
import pandas as pd
import tqdm
@@ -32,17 +33,57 @@ from mindspeed_llm.tasks.evaluation.eval_api.dataset_eval import DatasetEval
from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
from mindspeed_llm.training.utils import WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
from mindspeed_llm.tasks.evaluation.eval_utils.human_utils import humaneval_postprocess, get_score

logger = logging.getLogger(__name__)


def is_code_dangerous(code: str, dangerous_patterns) -> bool:
    """AST-based detection of privilege escalation, outbound connections and file tampering."""

    # Regex check (fast filter)
    for pattern in dangerous_patterns:
        if re.search(pattern, code):
            return True

    # AST semantic analysis (harder to bypass)
    try:
        tree = ast.parse(code)
        for node in ast.walk(tree):
            if isinstance(node, ast.Call):
                if isinstance(node.func, ast.Name):
                    if node.func.id in ("exec", "eval", "open", "os", "subprocess"):
                        return True
            elif isinstance(node, (ast.Import, ast.ImportFrom)):
                for alias in node.names:
                    if alias.name in ("os", "sys", "subprocess"):
                        return True

            # Detect os.system("sudo ...") and call arguments that invoke sudo or curl
            if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == "system":
                return True
            if isinstance(node, ast.Call) and any(isinstance(arg, ast.Str) and ("sudo" in arg.s or "curl" in arg.s) for arg in node.args):
                return True
            # Detect dynamic imports such as __import__("os").system(...)
            if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "__import__":
                return True

        return False
    except SyntaxError:
        return True  # code that fails to parse is treated as dangerous
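A hedged usage sketch of the guard above (it assumes the function is importable from this module and that the pattern file parses as JSON):

```python
import json

with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f:
    patterns = json.load(f)

print(is_code_dangerous("import os\nos.system('sudo rm -rf /')", patterns))   # True
print(is_code_dangerous("def add(a, b):\n    return a + b", patterns))        # False
```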
def extract_answer_code(answer, task: dict):
    """
    :param answer:
    :param task:
    :return:
    """
    """Generate the test file safely."""
    if is_code_dangerous(answer, self.dangerous_patterns) or is_code_dangerous(task["test"], self.dangerous_patterns):
        raise ValueError("Unsafe code detected")

    task_id = task['task_id']
    target_func = task['entry_point']
    test_case = task['test']
@@ -51,7 +92,7 @@ def extract_answer_code(answer, task: dict):
|
||||
code_lines = code.split("\n")
|
||||
target_func_flag = False
|
||||
if not os.path.exists(CODE_TEST_LOG_DIR):
|
||||
os.makedirs(CODE_TEST_LOG_DIR)
|
||||
os.makedirs(CODE_TEST_LOG_DIR, mode=0o750, exist_ok=True)
|
||||
test_code_path = "{}/{}".format(CODE_TEST_LOG_DIR, save_file)
|
||||
with os.fdopen(os.open(test_code_path, WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES), 'w') as f:
|
||||
f.write("from typing import List\n")
|
||||
@@ -85,7 +126,7 @@ def extract_answer_code(answer, task: dict):
|
||||
|
||||
class HumanEval(DatasetEval):
|
||||
def __init__(self, test_dir, eval_args):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
instruction_template = eval_args.instruction_template
|
||||
if instruction_template:
|
||||
self.instruction_template = instruction_template
|
||||
@@ -96,6 +137,11 @@ class HumanEval(DatasetEval):
        self.file_pbar = None
        self.task_pbar = None
        self.prompt = 'Complete the following python code:\n{prompt}'
        self.dangerous_patterns = []
        with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f:
            self.dangerous_patterns = json.load(f)
        print(self.dangerous_patterns)


    def read_problems(self) -> Dict[str, Dict]:
        return {task["task_id"]: task for task in self.stream_jsonl(self.test_dir)}
@@ -30,6 +30,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat
|
||||
from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero
|
||||
from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import _format_example, postprocess
|
||||
from mindspeed_llm.tasks.evaluation.utils import get_final_dataset
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
from .template import MMLU_TEMPLATE_DIR, get_eval_template
|
||||
|
||||
|
||||
@@ -42,7 +43,7 @@ class MmluEval(DatasetEval):
|
||||
"{question}\nAnswer:",
|
||||
output_template1=r".*(?P<answer>[A|B|C|D])\..*",
|
||||
output_template2=r"(?P<answer>[A|B|C|D])"):
|
||||
self.test_dir = test_dir
|
||||
self.test_dir = standardize_path(test_dir, check_read=True)
|
||||
self.instruction_template = instruction_template
|
||||
self.output_template = [output_template1, output_template2]
|
||||
self.batch_size = eval_args.evaluation_batch_size
|
||||
|
||||
@@ -157,7 +157,7 @@ def load_datasets(config: NeedleBenchConfig):
    dataset = []
    needle_file_path = os.path.join(config.dataset_dir, "needles.jsonl")
    cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(cache_dir, mode=0o750, exist_ok=True)
    cache_file = "9b5ad71b2ce5302211f9c61530b329a4922fc6a4"
    dest_filepath = os.path.join(cache_dir, cache_file)
    # See needlebench-evaluation.md in docs: download cl100k_base.tiktoken and place it in this dataset folder
@@ -1,5 +1,6 @@
|
||||
# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
|
||||
from functools import reduce
|
||||
import re
|
||||
import os
|
||||
import stat
|
||||
|
||||
@@ -52,17 +53,34 @@ def safe_open(file_path: str, mode='r', encoding=None, permission_mode=0o600, is
                     mode, encoding=encoding)


def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True):
def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True, check_read=True, check_write=True):
    """
    check path
    param: path
    return: data real path after check
    """
    check_path_is_none(path)
    if path:
        path = os.path.realpath(path)
    else:
        return None

    if os.path.exists(path):
        if check_read and not os.access(path, os.R_OK):
            raise RuntimeError(f"File {path} not readable")

        if check_write and not os.access(path, os.W_OK):
            raise RuntimeError(f"File {path} not writable")
    else:
        print(f"Path: {path} not exists")

    check_path_length_lt(path, max_path_length)
    if check_link:
        check_path_is_link(path)
    path = os.path.realpath(path)

    pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z]|[\u4e00-\u9fa5])+'
    if not re.fullmatch(pattern, path):
        raise RuntimeError(f"Invalid input path: {path}")

    return path
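A hedged usage sketch mirroring the call sites introduced elsewhere in this merge request (the paths are placeholders; note that both checks default to True in the signature above):

```python
# Sketch only; paths are placeholders.
ckpt_dir = standardize_path("/data/ckpt/llama", check_read=True)    # raises RuntimeError if unreadable
save_dir = standardize_path("/data/output", check_write=True)       # raises RuntimeError if unwritable
```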
@@ -7,7 +7,7 @@ try:
|
||||
from mindspeed.core.pipeline_parallel.fb_overlap.modules.attention import launch_async_all2all
|
||||
from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import get_post_process_flag
|
||||
except ImportError:
|
||||
pass
|
||||
print("[warning] failed import dualpipe modules, not support dualpipe")
|
||||
from mindspeed.core.transformer.moe.comm_utils import async_all_gather
|
||||
from mindspeed.core.tensor_parallel.random import CheckpointWithoutOutput
|
||||
from megatron.training.utils import get_args
|
||||
|
||||
@@ -2,7 +2,7 @@ import argparse
|
||||
import gc
|
||||
import json
|
||||
import re
|
||||
|
||||
import os
|
||||
import jsonlines
|
||||
import pandas as pd
|
||||
import torch
|
||||
@@ -13,6 +13,7 @@ from vllm.distributed.parallel_state import (destroy_distributed_environment, de
|
||||
|
||||
from utils import blending_datasets, PromptGtAnswerDataset, apply_GenRM_template, rejection_sampling_processor
|
||||
from mindspeed_llm.tasks.posttrain.verifier.rule_verifier import preprocess_box_response_for_qwen_prompt
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
|
||||
def clean_up():
|
||||
@@ -36,13 +37,13 @@ def batch_generate_vllm(args):
|
||||
dummy_strategy.args = args
|
||||
|
||||
# configure tokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# configure model
|
||||
llm = LLM(
|
||||
model=args.pretrain,
|
||||
tensor_parallel_size=args.tp_size,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
seed=args.seed,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
enable_prefix_caching=args.enable_prefix_caching,
|
||||
@@ -107,7 +108,7 @@ def batch_GenRM_rejection_sampling(args):
|
||||
llm = LLM(
|
||||
model=args.pretrain,
|
||||
tensor_parallel_size=args.tp_size,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
seed=args.seed,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
enable_prefix_caching=args.enable_prefix_caching,
|
||||
@@ -215,9 +216,15 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--iter", type=int, default=None,
|
||||
help="Used to slice the datasets in range iter * rollout_batch_size: (iter + 1) * rollout_batch_size", )
|
||||
parser.add_argument("--rollout-batch-size", type=int, default=2048, help="Number of samples to generate")
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='enable trust-remote-code for transformer to load model')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
args.output_path = standardize_path(args.output_path, check_write=True)
|
||||
|
||||
if args.task and args.task == "generate_vllm":
|
||||
batch_generate_vllm(args)
|
||||
elif args.task and args.task == "rejection_sampling":
|
||||
|
||||
@@ -31,7 +31,7 @@ def blending_datasets(
|
||||
ext = os.path.splitext(dataset)[-1]
|
||||
# local python script
|
||||
if ext == ".py" or (os.path.isdir(dataset) and os.path.exists(os.path.join(dataset, f"{dataset_basename}.py"))):
|
||||
data = load_dataset(dataset, trust_remote_code=True)
|
||||
data = load_dataset(dataset, trust_remote_code=False)
|
||||
strategy.print(f"loaded {dataset} with python script")
|
||||
# local text file
|
||||
elif ext in [".json", ".jsonl", ".csv"]:
|
||||
|
||||
@@ -13,7 +13,7 @@ from megatron.training import get_timers
|
||||
try:
|
||||
from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import set_post_process_flag
|
||||
except ImportError:
|
||||
pass
|
||||
print("[warning] failed import dualpipe modules, not support dualpipe")
|
||||
from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank, generate_actual_seq_len
|
||||
from mindspeed_llm.tasks.posttrain.base import BaseTrainer
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import Union
|
||||
|
||||
import os
|
||||
import torch
|
||||
|
||||
from megatron.core import mpu, dist_checkpointing
|
||||
@@ -19,6 +19,7 @@ from megatron.training.training import compute_throughputs_and_append_to_progres
|
||||
from megatron.training.utils import unwrap_model, print_rank_0, append_to_progress_log
|
||||
from megatron.training.yaml_arguments import core_transformer_config_from_yaml
|
||||
from mindspeed_llm.tasks.posttrain.orm.orm_model import GPTRewardModel
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
|
||||
def model_provider(is_reward_model=False, pre_process=True, post_process=True) -> Union[GPTModel]:
|
||||
@@ -137,6 +138,8 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
|
||||
if save_model_type:
|
||||
save_path = args.save + '/' + save_model_type
|
||||
|
||||
save_path = standardize_path(save_path, check_write=True)
|
||||
|
||||
ckpt_format = args.dist_ckpt_format if args.use_dist_ckpt else 'torch'
|
||||
print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format(
|
||||
iteration, save_path, ckpt_format))
|
||||
|
||||
@@ -38,7 +38,7 @@ def parse_digits(num):
|
||||
try:
|
||||
return float(num) / 100
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import random
|
||||
import re
|
||||
from typing import TypeVar, Iterable, List, Union, Any, Dict
|
||||
|
||||
import ast
|
||||
import regex
|
||||
import sympy
|
||||
from latex2sympy2 import latex2sympy
|
||||
@@ -75,7 +75,7 @@ def convert_word_number(text: str) -> str:
|
||||
try:
|
||||
text = str(w2n.word_to_num(text))
|
||||
except ValueError:
|
||||
pass
|
||||
return None
|
||||
return text
|
||||
|
||||
|
||||
@@ -468,7 +468,7 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
|
||||
pred = clean_units(pred)
|
||||
try:
|
||||
tmp = str(latex2sympy(pred))
|
||||
pred = str(eval(tmp))
|
||||
pred = str(ast.literal_eval(tmp))
|
||||
except Exception:
|
||||
if re.match(r"-?[\d\.]+\s\D+$", pred):
|
||||
pred = pred.split(" ")[0]
|
||||
|
||||
@@ -21,16 +21,14 @@ def load_jsonl(file: Union[str, Path]) -> Iterable[Any]:
|
||||
yield json.loads(line)
|
||||
except json.JSONDecodeError as e:
|
||||
print("Error in loading JSON:", line, "Error:", e)
|
||||
pass
|
||||
except Exception as e:
|
||||
print("Unexpected error in loading:", line, "Error:", e)
|
||||
pass
|
||||
|
||||
|
||||
def save_jsonl(samples, save_path):
|
||||
# ensure path
|
||||
folder = os.path.dirname(save_path)
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
os.makedirs(folder, mode=0o750, exist_ok=True)
|
||||
|
||||
with open(save_path, "w", encoding="utf-8") as f:
|
||||
for sample in samples:
|
||||
|
||||
@@ -1034,6 +1034,7 @@ def build_dataset(args):
|
||||
# for MOSS, streaming is needed.
|
||||
args.streaming = True
|
||||
if args.hf_datasets_params:
|
||||
args.hf_datasets_params = standardize_path(args.hf_datasets_params, check_read=True)
|
||||
with open(args.hf_datasets_params, 'r') as fin:
|
||||
param_dict = json.load(fin)
|
||||
return load_dataset(**param_dict)
|
||||
|
||||
@@ -190,7 +190,8 @@ class ToolFormatter(Formatter):
|
||||
return [default_tool_formatter(tools)]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
print(f"[warning] Unexpected error processing content: {content}. Error: {e}")
|
||||
return [""]
|
||||
|
||||
def extract(self, content: str) -> Union[str, Tuple[str, str]]:
|
||||
|
||||
@@ -23,7 +23,7 @@ from datasets import load_dataset, concatenate_datasets, interleave_datasets
|
||||
|
||||
from mindspeed_llm.tasks.preprocess.templates import Role
|
||||
from mindspeed_llm.tasks.preprocess.parser import InstructionDatasetAttr
|
||||
|
||||
from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -103,6 +103,7 @@ def get_dataset_list(data_args) -> List["InstructionDatasetAttr"]:
|
||||
else:
|
||||
dataset_names = []
|
||||
|
||||
data_args.dataset_dir = standardize_path(data_args.dataset_dir, check_read=True)
|
||||
try:
|
||||
with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
|
||||
dataset_info = json.load(f)
|
||||
|
||||
@@ -820,6 +820,10 @@ def _add_training_args(parser):
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='enable deterministic computing for npu')
|
||||
group.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='enable trust-remote-code for transformer to load model')
|
||||
group.add_argument('--jit-compile', action='store_true', default=False,
|
||||
help='Setting jit compile mode to True')
|
||||
group.add_argument('--prompt-type', type=str, default=None,
|
||||
|
||||
@@ -123,7 +123,7 @@ def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None,
|
||||
return state_dict, checkpoint_name, release
|
||||
|
||||
try:
|
||||
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False)
|
||||
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=True)
|
||||
except ModuleNotFoundError:
|
||||
from megatron.legacy.fp16_deprecated import loss_scaler
|
||||
# For backward compatibility.
|
||||
@@ -134,7 +134,7 @@ def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None,
|
||||
sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
|
||||
'megatron.legacy.fp16_deprecated.loss_scaler']
|
||||
sys.modules['megatron.model'] = sys.modules['megatron.legacy.model']
|
||||
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False)
|
||||
state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=True)
|
||||
sys.modules.pop('fp16.loss_scaler', None)
|
||||
sys.modules.pop('megatron.fp16.loss_scaler', None)
|
||||
sys.modules.pop('megatron.model', None)
|
||||
|
||||
@@ -49,6 +49,7 @@ def build_tokenizer(args):
|
||||
model_max_length=args.seq_length,
|
||||
use_fast=args.tokenizer_not_use_fast,
|
||||
prompt_type=args.prompt_type,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
**hf_tokenizer_kwargs
|
||||
)
|
||||
|
||||
@@ -108,7 +109,7 @@ class TokenizerAdaptor:
|
||||
class _AutoTokenizer(MegatronTokenizer):
|
||||
"""AutoTokenizer for Hf Pretrained model loading."""
|
||||
|
||||
def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, **kwargs):
|
||||
def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, trust_remote_code=False, **kwargs):
|
||||
name = tokenizer_name_or_path
|
||||
super().__init__(name)
|
||||
hf_tokenizer_kwargs = kwargs
|
||||
@@ -117,7 +118,7 @@ class _AutoTokenizer(MegatronTokenizer):
|
||||
|
||||
hf_tokenizer_kwargs["model_max_length"] = model_max_length
|
||||
hf_tokenizer_kwargs["use_fast"] = use_fast
|
||||
hf_tokenizer_kwargs["trust_remote_code"] = True
|
||||
hf_tokenizer_kwargs["trust_remote_code"] = trust_remote_code
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs, local_files_only=True)
|
||||
if (prompt_type is None) and (self.tokenizer.pad_token_id is None):
|
||||
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
|
||||
|
||||
@@ -195,6 +195,10 @@ def add_tokenizer_args(parser):
|
||||
default=[],
|
||||
help="The labels represent the correctness of each reasoning step in the entire reasoning process.",
|
||||
)
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='enable trust-remote-code for transformer to load model')
|
||||
|
||||
|
||||
def add_output_args(parser):
|
||||
|
||||
@@ -100,6 +100,7 @@ ROPE_ARGS="
|
||||
|
||||
|
||||
GPT_ARGS="
|
||||
--trust-remote-code \
|
||||
--no-gradient-accumulation-fusion \
|
||||
--spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
|
||||
--reset-position-ids \
|
||||
|
||||
@@ -46,6 +46,7 @@ MLA_ARGS="
|
||||
|
||||
|
||||
MOE_ARGS="
|
||||
--trust-remote-code \
|
||||
--moe-grouped-gemm \
|
||||
--moe-permutation-async-comm \
|
||||
--use-fused-moe-token-permute-and-unpermute \
|
||||
|
||||
@@ -34,6 +34,7 @@ DISTRIBUTED_ARGS="
|
||||
"
|
||||
|
||||
GPT_ARGS="
|
||||
--trust-remote-code \
|
||||
--tensor-model-parallel-size ${TP} \
|
||||
--pipeline-model-parallel-size ${PP} \
|
||||
--sequence-parallel \
|
||||
|
||||
@@ -31,6 +31,7 @@ DISTRIBUTED_ARGS="
|
||||
"
|
||||
|
||||
GPT_ARGS="
|
||||
--trust-remote-code \
|
||||
--use-mcore-models \
|
||||
--transformer-impl local \
|
||||
--tensor-model-parallel-size ${TP} \
|
||||
|
||||
@@ -97,7 +97,7 @@ def compare_state_dicts(state_dict1, state_dict2):


def process_file(file_path):
    data = torch.load(file_path, map_location='cpu', weights_only=False)
    data = torch.load(file_path, map_location='cpu', weights_only=True)
    layer_ckpt = {}
    # Stay compatible with weights saved under vpp (virtual pipeline parallelism)
    for key in data.keys():
@@ -164,8 +164,8 @@ def weight_compare(dir_1, dir_2, suffix="pt", use_md5=False):
|
||||
if use_md5:
|
||||
are_equal = (get_md5sum(path_1) == get_md5sum(path_2))
|
||||
else:
|
||||
state_dict1 = torch.load(path_1, weights_only=False)
|
||||
state_dict2 = torch.load(path_2, weights_only=False)
|
||||
state_dict1 = torch.load(path_1, weights_only=True)
|
||||
state_dict2 = torch.load(path_2, weights_only=True)
|
||||
are_equal = compare_state_dicts(state_dict1, state_dict2)
|
||||
if not are_equal:
|
||||
return False
|
||||
@@ -192,8 +192,8 @@ def weight_compare_optim(dir_1, dir_2, suffix="pt", use_md5=False):
|
||||
if use_md5:
|
||||
are_equal = (get_md5sum(path_1) == get_md5sum(path_2))
|
||||
else:
|
||||
state_dict1 = torch.load(path_1, weights_only=False)
|
||||
state_dict2 = torch.load(path_2, weights_only=False)
|
||||
state_dict1 = torch.load(path_1, weights_only=True)
|
||||
state_dict2 = torch.load(path_2, weights_only=True)
|
||||
are_equal = compare_state_dicts(state_dict1, state_dict2)
|
||||
|
||||
if not are_equal:
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
"test_deepseek2_hf2mcore_tp1pp4ep8": [
|
||||
{
|
||||
"param": {
|
||||
"trust-remote-code":null,
|
||||
"model-type":"GPT",
|
||||
"load-model-type":"hf",
|
||||
"save-model-type":"mg",
|
||||
@@ -23,6 +24,7 @@
|
||||
"test_deepseek2_mcore2hf_tp1pp4ep8": [
|
||||
{
|
||||
"param": {
|
||||
"trust-remote-code":null,
|
||||
"model-type":"GPT",
|
||||
"load-model-type":"mg",
|
||||
"save-model-type": "hf",
|
||||
|