Add model merger to save checkpoints in the format of .safetensor and push them…

Add model merger to save checkpoints in the format of .safetensor and push them to the huggingface (#262) This PR introduces a new script, `scripts/model_merger.py`, which enables the conversion of model checkpoints saved in `.pt` format to the `.safetensors` format. The script also includes functionality to optionally push the converted model to Hugging Face Hub. ### Changes: 1. Added `scripts/model_merger.py` to handle the conversion process. 2. Implemented support for `.pt` to `.safetensors` transformation. 3. Added an option to push the converted model to Hugging Face Hub if required.

Add model merger to save checkpoints in the format of .safetensor and push them…
Add model merger to save checkpoints in the format of .safetensor and push them to the huggingface (#262) This PR introduces a new script, `scripts/model_merger.py`, which enables the conversion of model checkpoints saved in `.pt` format to the `.safetensors` format. The script also includes functionality to optionally push the converted model to Hugging Face Hub. ### Changes: 1. Added `scripts/model_merger.py` to handle the conversion process. 2. Implemented support for `.pt` to `.safetensors` transformation. 3. Added an option to push the converted model to Hugging Face Hub if required.
1703c341 · Zhihan · GitHub · 58a5c46e · 1703c341
Unverified Commit 1703c341 authored Feb 14, 2025 by Zhihan Committed by GitHub Feb 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 170 additions and 0 deletions

scripts/model_merger.py
+170 -0

No files found.
--- a/scripts/model_merger.py
+++ b/scripts/model_merger.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Tuple, Dict
+import re
+import os
+import torch
+import argparse
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForTokenClassification
+from concurrent.futures import ThreadPoolExecutor
+from torch.distributed._tensor import DTensor, Shard, Placement
+
+
+def merge_by_placement(tensors: List[torch.Tensor], placement: Placement):
+    """Combine the per-rank local tensors of one parameter into a single tensor.
+
+    Args:
+        tensors: Local tensors of the parameter, one per shard, in rank order.
+        placement: The placement describing how the parameter was distributed.
+
+    Returns:
+        The first tensor when the placement is replicated, otherwise the
+        contiguous concatenation of all tensors along ``placement.dim``.
+
+    Raises:
+        NotImplementedError: If the placement is partial.
+        ValueError: If the placement is none of replicate, partial or shard.
+    """
+    # Replicated: every rank holds the same data, so the first copy is enough.
+    if placement.is_replicate():
+        return tensors[0]
+    if placement.is_partial():
+        raise NotImplementedError("Partial placement is not supported yet")
+    if not placement.is_shard():
+        raise ValueError(f"Unsupported placement: {placement}")
+    # Sharded: stitch the pieces back together along the sharded dimension.
+    merged = torch.cat(tensors, dim=placement.dim)
+    return merged.contiguous()
+
+
+if __name__ == '__main__':
+    # Script entry point: merge the per-rank checkpoint shards found in
+    # --local_dir into one state dict, save it under <local_dir>/huggingface via
+    # `save_pretrained`, and optionally upload that folder to the Hugging Face Hub.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', required=True, type=str, help="The path for your saved model")
+    parser.add_argument("--hf_upload_path", default=None, type=str, help="The path of the huggingface repo to upload")
+    args = parser.parse_args()
+
+    assert not args.local_dir.endswith("huggingface"), "The local_dir should not end with huggingface"
+    local_dir = args.local_dir
+
+    # Locate the rank-0 shard; its file name carries the world size, and its
+    # tensors carry the device mesh used to work out how many shards exist.
+    rank = 0
+    world_size = 0
+    for filename in os.listdir(local_dir):
+        match = re.match(r"model_world_size_(\d+)_rank_0\.pt", filename)
+        if match:
+            # Kept as the matched string so the file names rebuilt below are
+            # byte-identical to the one found on disk.
+            world_size = match.group(1)
+            break
+    assert world_size, "No model file with the proper format"
+
+    # weights_only=False matches process_one_shard below: the assert that
+    # follows expects DTensor objects to come back from the load.
+    # NOTE: this unpickles arbitrary objects; only run on checkpoints you trust.
+    state_dict = torch.load(os.path.join(local_dir, f'model_world_size_{world_size}_rank_{rank}.pt'),
+                            map_location='cpu',
+                            weights_only=False)
+    pivot_key = sorted(list(state_dict.keys()))[0]
+    weight = state_dict[pivot_key]
+    assert isinstance(weight, torch.distributed._tensor.DTensor)
+    # get sharding info
+    device_mesh = weight.device_mesh
+    mesh = device_mesh.mesh
+    mesh_dim_names = device_mesh.mesh_dim_names
+
+    print(f'Got device mesh {mesh}, mesh_dim_names {mesh_dim_names}')
+
+    assert mesh_dim_names in (
+        ('fsdp',),
+    ), f'Unsupported mesh_dim_names {mesh_dim_names}'
+
+    # The 'tp' branch cannot be reached while the assert above only admits
+    # ('fsdp',); it is kept as the hook for future FSDP + TP support.
+    if 'tp' in mesh_dim_names:
+        # fsdp * tp
+        total_shards = mesh.shape[-1] * mesh.shape[-2]
+        mesh_shape = (mesh.shape[-2], mesh.shape[-1])
+    else:
+        # fsdp
+        total_shards = mesh.shape[-1]
+        mesh_shape = (mesh.shape[-1],)
+
+    print(f'Processing model shards with {total_shards} {mesh_shape} in total')
+
+    # Slot 0 is the already-loaded rank-0 shard; the remaining slots are
+    # placeholders filled in by the worker threads, indexed by rank.
+    model_state_dict_lst = []
+    model_state_dict_lst.append(state_dict)
+    model_state_dict_lst.extend([""] * (total_shards - 1))
+
+    def process_one_shard(rank):
+        """Load the checkpoint shard of `rank` onto CPU and store it in its slot."""
+        model_path = os.path.join(local_dir, f'model_world_size_{world_size}_rank_{rank}.pt')
+        state_dict = torch.load(model_path, map_location='cpu', weights_only=False)
+        model_state_dict_lst[rank] = state_dict
+        return state_dict
+
+    # os.cpu_count() may return None, so fall back to a single worker.
+    with ThreadPoolExecutor(max_workers=min(32, os.cpu_count() or 1)) as executor:
+        futures = [executor.submit(process_one_shard, shard_rank) for shard_rank in range(1, total_shards)]
+    # Surface any exception raised while loading a shard; without this a failed
+    # load would be silently dropped and leave a placeholder in the list.
+    for future in futures:
+        future.result()
+
+    state_dict = {}
+    param_placements: Dict[str, List[Placement]] = {}
+    keys = set(model_state_dict_lst[0].keys())
+    for key in keys:
+        state_dict[key] = []
+        for shard_rank, model_state_dict in enumerate(model_state_dict_lst):
+            if key not in model_state_dict:
+                raise KeyError(f"Parameter {key} is missing from the rank {shard_rank} checkpoint shard")
+            # pop() drops the shard's reference so each tensor is held only once.
+            tensor = model_state_dict.pop(key)
+            if isinstance(tensor, DTensor):
+                state_dict[key].append(tensor._local_tensor.bfloat16())
+                placements = tuple(tensor.placements)
+                # replicated placement at dp dimension can be discarded
+                if mesh_dim_names[0] == 'dp':
+                    placements = placements[1:]
+                # Every shard must report the same placements for a parameter.
+                if key not in param_placements:
+                    param_placements[key] = placements
+                else:
+                    assert param_placements[key] == placements
+            else:
+                # Plain tensors are stored directly (not as a list), which marks
+                # them as "no merge needed" for the loop below.
+                state_dict[key] = tensor.bfloat16()
+
+    del model_state_dict_lst
+
+    for key in sorted(state_dict):
+        if not isinstance(state_dict[key], list):
+            print(f"No need to merge key {key}")
+            continue
+        # merge shards
+        placements: Tuple[Shard] = param_placements[key]
+        if len(mesh_shape) == 1:
+            # 1-D list, FSDP without TP
+            assert len(placements) == 1
+            shards = state_dict[key]
+            state_dict[key] = merge_by_placement(shards, placements[0])
+        else:
+            # 2-D list, FSDP + TP
+            raise NotImplementedError("FSDP + TP is not supported yet")
+
+    print('Writing to local disk')
+    hf_path = os.path.join(local_dir, 'huggingface')
+    config = AutoConfig.from_pretrained(hf_path)
+
+    # Pick the auto-model class from the first architecture name in the config.
+    architecture = config.architectures[0]
+    if 'ForTokenClassification' in architecture:
+        auto_model = AutoModelForTokenClassification
+    elif 'ForCausalLM' in architecture:
+        auto_model = AutoModelForCausalLM
+    else:
+        # Attribute access: the original message subscripted the config object,
+        # which was itself an error and hid this NotImplementedError.
+        raise NotImplementedError(f'Unknown architecture {config.architectures}')
+
+    # Build the model on the meta device, then allocate uninitialised CPU
+    # storage; the merged state_dict is passed to save_pretrained explicitly.
+    with torch.device('meta'):
+        model = auto_model.from_config(config, torch_dtype=torch.bfloat16)
+    model.to_empty(device='cpu')
+
+    print(f'Saving model to {hf_path}')
+    model.save_pretrained(hf_path, state_dict=state_dict)
+    del state_dict
+    del model
+    if args.hf_upload_path:
+        # Push to hugging face
+        from huggingface_hub import HfApi
+        api = HfApi()
+        api.create_repo(repo_id=args.hf_upload_path, private=False, exist_ok=True)
+        api.upload_folder(
+            folder_path=hf_path,
+            repo_id=args.hf_upload_path,
+            repo_type="model"
+        )
+    
+    
+
+
+
+
+