Module stoke.configs

Handles all config objects


??? example "View Source" # -- coding: utf-8 --

    # Copyright FMR LLC <opensource@fidelity.com>

    # SPDX-License-Identifier: Apache-2.0



    """Handles all config objects"""



    from enum import Enum

    from typing import Dict, Optional, Type



    import attr

    import torch



    try:

        from typing import TypedDict

    except ImportError:

        from mypy_extensions import TypedDict





    class HorovodOps(Enum):

        """Horovod ops options"""



        Average = "Average"

        Sum = "Sum"

        Adasum = "Adasum"





    class OffloadDevice(Enum):

        """Offload device options"""



        none = "none"

        cpu = "cpu"

        nvme = "nvme"





    class BackendOptions(Enum):

        """Communication backend options"""



        nccl = "nccl"

        mpi = "mpi"

        gloo = "gloo"





    @attr.s(auto_attribs=True)

    class AMPConfig:

        """PyTorch AMP configuration class



        Attributes

        ----------

        backoff_factor : float, default: 0.5

            Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration

        growth_factor : float, default: 2.0

            Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.

        growth_interval : int, default: 2000

            Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by

            growth_factor

        init_scale : float, default: 2.**16

            Initial scale factor



        """



        backoff_factor: float = 0.5

        growth_factor: float = 2.0

        growth_interval: int = 2000

        init_scale: float = 2.0 ** 16





    @attr.s(auto_attribs=True)

    class ApexConfig:

        """Nvidia APEX configuration class



        Attributes

        ----------

        cast_model_outputs: Optional[torch.dtype], default: None

            Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls

            https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm

        max_loss_scale: float, default: 2.**24

            Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling

        min_loss_scale: Optional[float], default: None

            Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None

            means that no floor is imposed

        scaler_per_loss: bool, default: False

            Option to impose a scaler for each loss instead of a global scaler

        verbosity: int, default: 0

            Set to 0 to suppress Amp-related output



        """



        cast_model_outputs: Optional[torch.dtype] = None

        convert_to_sync_batch_norm: bool = False

        max_loss_scale: float = 2.0 ** 24

        min_loss_scale: Optional[float] = None

        scaler_per_loss: bool = False

        verbosity: int = 0





    @attr.s(auto_attribs=True)

    class ClipGradConfig:

        """Gradient clipping by value configuration class



        Attributes

        ----------

        clip_value: float

            maximum allowed absolute value of the gradients [-clip_value, clip_value]



        """



        clip_value: float





    @attr.s(auto_attribs=True)

    class ClipGradNormConfig:

        """Gradient clipping by p-norm configuration class



        Attributes

        ----------

        max_norm: float

            max norm of the gradients

        norm_type: float

            type of the used p-norm



        """



        max_norm: float

        norm_type: float





    @attr.s(auto_attribs=True)

    class DDPConfig:

        """PyTorch DistributedDataParallel configuration class



        Attributes

        ----------

        local_rank: Optional[int]

            Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg)

        auto_mpi_discovery: bool, default: False

            if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed

            function call)

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls

            https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html

        backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        broadcast_buffers: bool, default: True

            Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function

        bucket_cap_mb: int, default: 25

            DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket

            can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)

        find_unused_parameters: bool, default: False

            Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward

            function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready

            to be reduced. Note that all forward outputs that are derived from module parameters must participate in

            calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd

            to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused

            can be detached from the autograd graph using torch.Tensor.detach

        gradient_as_bucket_view: bool, default: False

            When set to True, gradients will be views pointing to different offsets of allreduce communication

            buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients

            size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When

            gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by

            referring to the zero_grad() function in torch/optim/optimizer.py as a solution.

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        no_sync: bool, default: True

            for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on

            module variables, which will later be synchronized in the first forward-backward pass after exiting the

            context. no_sync might lead to higher memory usage but lower communication overhead



        """



        local_rank: Optional[int]

        auto_mpi_discovery: bool = False

        convert_to_sync_batch_norm: bool = False

        backend: BackendOptions = "nccl"

        broadcast_buffers: bool = True

        bucket_cap_mb: int = 25

        find_unused_parameters: bool = False

        gradient_as_bucket_view: bool = False

        init_method: str = "env://"

        no_sync: bool = True





    @attr.s(auto_attribs=True)

    class DeepspeedAIOConfig:

        """Deepspeed asynchronous I/O configuration class



        Attributes

        ----------

        block_size: int, default: 1048576

            I/O block size in bytes

        ignore_unused_parameters: bool, default: True

            Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.

            This controls whether or not training should terminate with an error message when unused parameters are

            detected.

        overlap_events: bool, default: True

            Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.

        queue_depth: int, default: 8

            I/O queue depth

        single_submit: bool, default: False

            Submit requests to storage device as multiple individual requests as opposed to one block of requests.

        thread_count: int, default: 1

            Intra-request parallelism for each read/write submitted by a user thread.



        """



        block_size: int = 1048576

        ignore_unused_parameters: bool = True

        overlap_events: bool = True

        queue_depth: int = 8

        single_submit: bool = False

        thread_count: int = 1





    @attr.s(auto_attribs=True)

    class DeepspeedActivationCheckpointingConfig:

        """Deepspeed activation checkpointing configuration class



        Attributes

        ----------

        contiguous_memory_optimization: bool, default: False

            Copies partitioned activations so that they are contiguous in memory

        cpu_checkpointing: bool, default: False

            Offloads partitioned activations to CPU if partition_activations is enabled

        number_checkpoints: Optional[int], default: None

            Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization

        partition_activations: bool, default: False

            Enables partition activation when used with model parallelism

        profile: bool, default: False

            Logs the forward and backward time for each checkpoint function

        synchronize_checkpoint_boundary: bool, default: False

            Inserts torch.cuda.synchronize() at each checkpoint boundary



        """



        contiguous_memory_optimization: bool = False

        cpu_checkpointing: bool = False

        number_checkpoints: Optional[int] = None

        partition_activations: bool = False

        profile: bool = False

        synchronize_checkpoint_boundary: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedFlopsConfig:

        """Deepspeed flops profiler configuration class



        Attributes

        ----------

        detailed: bool, default: True

            Whether to print the detailed model profile

        module_depth: int, default: -1

            The depth of the model at which to print the aggregated module information. When set to -1, it prints

            information from the top module to the innermost modules (the maximum depth).

        output_file: Optional[str], default: None

            Path to the output file. If None, the profiler prints to stdout

        profile_step: int, default: 1

            The global training step at which to profile.

        top_modules: int, default: 1

            Limits the aggregated profile output to the number of top modules specified.



        Notes

        -----

        Warm up steps are needed for accurate time measurement



        """



        detailed: bool = True

        module_depth: int = -1

        output_file: Optional[str] = None

        profile_step: int = 1

        top_modules: int = 1





    @attr.s(auto_attribs=True)

    class DeepspeedFP16Config:

        """Deepspeed FP16 configuration class



        Attributes

        ----------

        hysteresis: int, default: 2

            represents the delay shift in dynamic loss scaling

        initial_scale_power: int, default: 32

            power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power

        loss_scale: float, default: 0.0

            loss scaling value for FP16 training (0.0 --> dynamic scaling)

        loss_scale_window: int, default: 1000

            the window over which to raise/lower the dynamic loss scale value

        min_loss_scale: int, default: 1000

            minimum dynamic loss scale value



        """



        hysteresis: int = 2

        initial_scale_power: int = 32

        loss_scale: float = 0.0

        loss_scale_window: int = 1000

        min_loss_scale: int = 1000





    @attr.s(auto_attribs=True)

    class DeepspeedOffloadOptimizerConfig:

        """Deepspeed optimizer offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 4

            Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number

            of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,

            gradient, momentum, and variance).

        device: OffloadDevice, default: 'cpu'

            Device memory to offload optimizer state

        fast_init: bool, default: False

            Enable fast optimizer initialization when offloading to NVMe

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for optimizer state offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.

        pipeline: bool, default: False

            pipeline activated (will default to True if either pipeline_read or pipeline_write is set)

        pipeline_read: bool, default: False

            activate pipeline read (deepspeed has limited docs for what this does)

        pipeline_write: bool, default: False

            activate pipeline write (deepspeed has limited docs for what this does)



        """



        buffer_count: int = 4

        device: OffloadDevice = "cpu"

        fast_init: bool = False

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False

        pipeline: bool = False

        pipeline_read: bool = False

        pipeline_write: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedOffloadParamConfig:

        """Deepspeed parameter offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 5

            Number of buffers in buffer pool for parameter offloading to NVMe

        buffer_size: int, default: int(1E8)

            Size of buffers in buffer pool for parameter offloading to NVMe

        device: OffloadDevice, default: 'cpu'

            Device memory to offload model parameters

        max_in_cpu: int, default: int(1E9)

            Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for parameter offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.



        """



        buffer_count: int = 5

        buffer_size: int = int(1e8)

        device: OffloadDevice = "cpu"

        max_in_cpu: int = int(1e9)

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedPLDConfig:

        """

        Attributes

        ----------

        theta: float, default: 1.0

            Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,

            the faster the training speed

        gamma: float, default: 0.001

            Hyper-parameter that controls how fast the drop ratio increases



        """



        theta: float = 1.0

        gamma: float = 0.001





    @attr.s(auto_attribs=True)

    class DeepspeedTensorboardConfig:

        """Deepspeed Tensorboard configuration class



        Attributes

        ----------

        output_path: str, default: ''

            Tensorboard output path

        job_name: str, default: 'DeepSpeedJobName'

            Tensorboard job name



        """



        output_path: str = ""

        job_name: str = "DeepSpeedJobName"





    @attr.s(auto_attribs=True)

    class DeepspeedZeROConfig:

        """Deepspeed ZeRO configuration class



        Attributes

        ----------

        allgather_bucket_size: int, default: int(5E8)

            Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes

        allgather_partitions: bool, default: True

            Chooses between allgather collective or a series of broadcast collectives to gather updated parameters

            from all the GPUs at the end of each step

        contiguous_gradients: bool, default: False

            Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward

            pass. Only useful when running very large models.

        ignore_unused_parameters: bool, default: True

            Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload

            Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707

        legacy_stage1: bool, default: False

            Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None

            Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU

            memory for larger models or batch sizes. Valid only with stage 3

        offload_param: Optional[DeepspeedOffloadParamConfig], default: None

            Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch

            sizes. Valid only with stage 3.

        overlap_comm: bool, default: False

            Attempts to overlap the reduction of the gradients with backward computation

        reduce_bucket_size: int, default: int(5E8)

            Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large

            model sizes

        reduce_scatter: bool, default: True

            Uses reduce or reduce scatter instead of allreduce to average gradients

        stage: int, default: 0

            Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer to disabled, optimizer state

            partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,

            respectively

        stage3_max_live_parameters: int, default: int(1E9)

            The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but

            perform more communication.

        stage3_max_reuse_distance: int, default: int(1E9)

            Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less

            memory, but perform more communication.

        stage3_prefetch_bucket_size: int, default: int(5E8)

            The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase

            stalls due to communication.

        stage3_param_persistence_threshold: int, default: int(1E6)

            Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly

            increase communication (especially latency-bound messages).

        stage3_gather_fp16_weights_on_model_save: bool, default: False

            Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned

            across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this

            option is enabled and then saves the fp16 model weights.

        sub_group_size: int, default: int(1E12)

            sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are

            grouped into buckets of sub_group_size and each bucket is updated one at a time.



        """



        allgather_bucket_size: int = int(5e8)

        allgather_partitions: bool = True

        contiguous_gradients: bool = False

        ignore_unused_parameters: bool = True

        legacy_stage1: bool = False

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None

        offload_param: Optional[DeepspeedOffloadParamConfig] = None

        overlap_comm: bool = False

        reduce_bucket_size: int = int(5e8)

        reduce_scatter: bool = True

        stage: int = 0

        stage3_max_live_parameters: int = int(1e9)

        stage3_max_reuse_distance: int = int(1e9)

        stage3_prefetch_bucket_size: int = int(5e8)

        stage3_param_persistence_threshold: int = int(1e6)

        stage3_gather_fp16_weights_on_model_save: bool = False

        sub_group_size: int = int(1e12)





    @attr.s(auto_attribs=True)

    class DeepspeedConfig:

        """Deepspeed configuration class



        Composed of other configuration classes related to specific functionality



        Attributes

        ----------

        activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()

            Enables and configures activation checkpointing

        aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()

            Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent

            (NVMe) storage

        auto_mpi_discovery: bool, default: True

            if distributed environment variables are not set, attempt to discover them from MPI

        disable_allgather: bool, default: False

            Disables allgather

        dist_backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        distributed_port: int, default: 29500

            torch distributed backend port

        dump_state: bool, default: False

            Print out state information of DeepSpeed object after initialization

        flops_profiler: Optional[DeepspeedFlopsConfig], default: None

            Enables and configures the flops profiler. This would also enable wall_clock_breakdown

        fp16: Optional[DeepspeedFP16Config], default: None

            Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package

        fp32_allreduce: bool, default: False

            During gradient averaging perform allreduce with 32 bit values

        gradient_predivide_factor: float, default: 1.0

            Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability

            when scaling to large numbers of GPUs

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        prescale_gradients: bool, default: False

            Scale gradients before doing allreduce

        progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None

            Enables and configures progressive layer dropping

        sparse_gradients: bool, default: False

            Enable sparse compression of torch.nn.Embedding gradients

        steps_per_print: int, default: 10

            Print train loss every N steps

        tensorboard: Optional[DeepspeedTensorboardConfig], default: None

            Enables and configures tensorboard support

        verbose: bool, default: True

            flag to make deepspeed engine verbose with information

        wall_clock_breakdown: bool, default: False

            Enable timing of the latency of forward/backward/update training phases

        zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()

            Enables and configures ZeRO memory optimizations



        Notes

        -----

        Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16

        here is similar to AMP’s O2 mode



        """



        activation_checkpointing: Optional[

            DeepspeedActivationCheckpointingConfig

        ] = DeepspeedActivationCheckpointingConfig()

        aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()

        auto_mpi_discovery: bool = True

        disable_allgather: bool = False

        dist_backend: BackendOptions = "nccl"

        distributed_port: int = 29500

        dump_state: bool = False

        flops_profiler: Optional[DeepspeedFlopsConfig] = None

        fp16: Optional[DeepspeedFP16Config] = None

        fp32_allreduce: bool = False

        gradient_predivide_factor: float = 1.0

        init_method: str = "env://"

        prescale_gradients: bool = False

        progressive_layer_drop: Optional[DeepspeedPLDConfig] = None

        sparse_gradients: bool = False

        steps_per_print: int = 10

        tensorboard: Optional[DeepspeedTensorboardConfig] = None

        verbose: bool = True

        wall_clock_breakdown: bool = False

        zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()





    @attr.s(auto_attribs=True)

    class FairscaleOSSConfig:

        """Fairscale optimizer state sharding configuration class



        Attributes

        ----------

        broadcast_fp16: bool, default: False

            Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP

            is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.



        """



        broadcast_fp16: bool = False





    @attr.s(auto_attribs=True)

    class FairscaleSDDPConfig:

        """Fairscale sharded data parallel (SDDP) configuration class



        Attributes

        ----------

        auto_refresh_trainable: bool, default: True

            Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS

            automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a

            parameter is frozen or unfrozen

        broadcast_buffers: bool, default: True

            Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same

            setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters.

        reduce_buffer_size: int, default: 2 ** 23

            The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact

            the long term memory consumption, because these buckets correspond to parameters which will not be sharded.

            Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.

        reduce_fp16: bool, default: False

            cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve

            performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and

            will also save some memory.

        sync_models_at_startup: bool, default: True

            Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or

            the training restarts from a saved state



        """



        auto_refresh_trainable: bool = True

        broadcast_buffers: bool = True

        reduce_buffer_size: int = 2 ** 23

        reduce_fp16: bool = False

        sync_models_at_startup: bool = True





    @attr.s(auto_attribs=True)

    class FairscaleFSDPConfig:

        """Fairscale Fully Sharded Data Parallel configuration class



        Attributes

        ----------

        bucket_cap_mb: int, default: 25

            FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.

            bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the

            max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple

            bucket_cap_mb sized buffers) shared by all FSDP instances. Large gradient tensors are directly reduced without

            using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with

            computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory

            overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during

            the backward pass and freed at the end of the backward pass to save more memory for other phases of the

            training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP

            engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The

            order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the

            backward. In FSDP, the buffer size does not change with model size (it changes based on number of

            <dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush

            call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with

            compute is done differently too. Values <= 0 disable bucketing

        buffer_dtype: Optional[torch.dtype], default: None

            dtype for buffers for computation. defaults to value of compute_dtype

        clear_autocast_cache: bool, default: False

            When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast

            maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this

            flag to true will help clearing this cache as inner FSDP instances finish part of the forward pass to save

            GPU memory

        compute_dtype: Optional[torch.dtype], default: None

            dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,

            in which case it defaults to torch.float16.

        flatten_parameters: bool, default: True

            flatten parameters into a single contiguous tensor, which improves training speed

        force_input_to_fp32: bool, default: False

            force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision

            mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.

        fp32_reduce_scatter: bool, default: False

            reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used

        gradient_predivide_factor: Optional[float], default: None

            divide factor before the reduction

        gradient_postdivide_factor: Optional[float], default: None

            divide factor after the reduction

        move_grads_to_cpu: Optional[bool], default: None

            move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used

        move_params_to_cpu: bool, default: False

            offload FP32 params to CPU. This is only relevant when FP16 AMP is used

        no_broadcast_optim_state: Optional[bool], default: False

            do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this

            true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the

            proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few

            parameters can fit on one node

        reshard_after_forward: bool, default: True

            reshard parameters after the forward pass. This saves memory but slows training. This is only relevant

            when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)

        verbose: bool, default: False

            turn on verbose output for model’s string representation



        Notes

        -----

        mixed_precision: bool

            This value will automatically be set from the Stoke FP16 selected option (AMP only)

        state_dict_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup

        compute_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup



        """



        bucket_cap_mb: int = 25

        buffer_dtype: Optional[torch.dtype] = None

        clear_autocast_cache: bool = False

        compute_dtype: Optional[torch.dtype] = None

        flatten_parameters: bool = True

        force_input_to_fp32: bool = False

        fp32_reduce_scatter: bool = False

        gradient_predivide_factor: Optional[float] = None

        gradient_postdivide_factor: Optional[float] = None

        move_grads_to_cpu: Optional[bool] = None

        move_params_to_cpu: bool = False

        no_broadcast_optim_state: Optional[bool] = False

        reshard_after_forward: bool = True

        verbose: bool = False





    @attr.s(auto_attribs=True)

    class HorovodConfig:

        """Horovod configuration class



        Attributes

        ----------

        compression: bool, default: False

            Compression algorithm used during allreduce to reduce the amount of data sent during each parameter

            update step.

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls

            https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm

        gradient_predivide_factor: float, default: 1.0

            If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled

            by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.

        op: HorovodOps, default: 'Average'

            The reduction operation to use when combining gradients across different ranks.



        """



        compression: bool = False

        convert_to_sync_batch_norm: bool = False

        gradient_predivide_factor: float = 1.0

        op: HorovodOps = "Average"





    class StokeOptimizer(TypedDict):

        """Stoke optimizer wrapper class



    Given all the different backends and extensions, the optimizer might need to be instantiated in a different way;

    thus this typed dict holds the configuration without instantiation



        Attributes

        ----------

        optimizer: Type[torch.optim.Optimizer]

            un-instantiated torch.optim.Optimizer class

        optimizer_kwargs: Dict

            any keyword args to be unrolled into the optimizer at instantiation time



        """



        optimizer: Type[torch.optim.Optimizer]

        optimizer_kwargs: Dict
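
A minimal sketch of how these objects are typically constructed, assuming `stoke` is installed and that the names below are importable from `stoke.configs` (how they are handed to the top-level Stoke wrapper is documented elsewhere):

    import torch
    from stoke.configs import AMPConfig, ClipGradNormConfig, StokeOptimizer

    # StokeOptimizer is a TypedDict: the optimizer class is passed un-instantiated
    # and the kwargs are unrolled into it later at instantiation time
    opt = StokeOptimizer(
        optimizer=torch.optim.Adam,
        optimizer_kwargs={"lr": 1e-3},
    )

    # The config objects are plain attrs classes built from keyword arguments
    amp_config = AMPConfig(init_scale=2.0 ** 14)
    clip_config = ClipGradNormConfig(max_norm=5.0, norm_type=2.0)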

Classes

AMPConfig

class AMPConfig(
    backoff_factor: float = 0.5,
    growth_factor: float = 2.0,
    growth_interval: int = 2000,
    init_scale: float = 65536.0
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| backoff_factor | float, default: 0.5 | Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration | None |
| growth_factor | float, default: 2.0 | Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations. | None |
| growth_interval | int, default: 2000 | Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by growth_factor | None |
| init_scale | float, default: 2.**16 | Initial scale factor | None |

??? example "View Source" class AMPConfig:

        """PyTorch AMP configuration class



        Attributes

        ----------

        backoff_factor : float, default: 0.5

            Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration

        growth_factor : float, default: 2.0

            Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.

        growth_interval : int, default: 2000

            Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by

            growth_factor

        init_scale : float, default: 2.**16

            Initial scale factor



        """



        backoff_factor: float = 0.5

        growth_factor: float = 2.0

        growth_interval: int = 2000

        init_scale: float = 2.0 ** 16
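
A brief sketch (values illustrative, not recommendations) of overriding the native PyTorch AMP grad-scaler settings through this config:

    from stoke.configs import AMPConfig

    # Start the dynamic loss scale lower and attempt to grow it more often
    amp_config = AMPConfig(
        init_scale=2.0 ** 14,
        growth_interval=1000,
    )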

ApexConfig

class ApexConfig(
    cast_model_outputs: Union[torch.dtype, NoneType] = None,
    convert_to_sync_batch_norm: bool = False,
    max_loss_scale: float = 16777216.0,
    min_loss_scale: Union[float, NoneType] = None,
    scaler_per_loss: bool = False,
    verbosity: int = 0
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| cast_model_outputs | Optional[torch.dtype], default: None | Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm | None |
| max_loss_scale | float, default: 2.**24 | Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling | None |
| min_loss_scale | Optional[float], default: None | Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed | None |
| scaler_per_loss | bool, default: False | Option to impose a scaler for each loss instead of a global scaler | None |
| verbosity | int, default: 0 | Set to 0 to suppress Amp-related output | None |

??? example "View Source" class ApexConfig:

        """Nvidia APEX configuration class



        Attributes

        ----------

        cast_model_outputs: Optional[torch.dtype], default: None

            Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls

            https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm

        max_loss_scale: float, default: 2.**24

            Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling

        min_loss_scale: Optional[float], default: None

            Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None

            means that no floor is imposed

        scaler_per_loss: bool, default: False

            Option to impose a scaler for each loss instead of a global scaler

        verbosity: int, default: 0

            Set to 0 to suppress Amp-related output



        """



        cast_model_outputs: Optional[torch.dtype] = None

        convert_to_sync_batch_norm: bool = False

        max_loss_scale: float = 2.0 ** 24

        min_loss_scale: Optional[float] = None

        scaler_per_loss: bool = False

        verbosity: int = 0
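
A hedged sketch of a typical override, using only the attributes documented above (values illustrative):

    import torch
    from stoke.configs import ApexConfig

    # Always cast model outputs to FP32 and swap batch-norm layers for
    # apex.parallel.SyncBatchNorm
    apex_config = ApexConfig(
        cast_model_outputs=torch.float32,
        convert_to_sync_batch_norm=True,
    )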

BackendOptions

class BackendOptions(
    /,
    *args,
    **kwargs
)

??? example "View Source" class BackendOptions(Enum):

        """Communication backend options"""



        nccl = "nccl"

        mpi = "mpi"

        gloo = "gloo"

Ancestors (in MRO)

  • enum.Enum

Class variables

gloo
mpi
name
nccl
value

ClipGradConfig

class ClipGradConfig(
    clip_value: float
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| clip_value | float | maximum allowed absolute value of the gradients [-clip_value, clip_value] | None |

??? example "View Source" class ClipGradConfig:

        """Gradient clipping by value configuration class



        Attributes

        ----------

        clip_value: float

            maximum allowed absolute value of the gradients [-clip_value, clip_value]



        """



        clip_value: float

ClipGradNormConfig

class ClipGradNormConfig(
    max_norm: float,
    norm_type: float
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| max_norm | float | max norm of the gradients | None |
| norm_type | float | type of the used p-norm | None |

??? example "View Source" class ClipGradNormConfig:

        """Gradient clipping by p-norm configuration class



        Attributes

        ----------

        max_norm: float

            max norm of the gradients

        norm_type: float

            type of the used p-norm



        """



        max_norm: float

        norm_type: float
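
A short sketch showing both gradient-clipping configs side by side (thresholds illustrative):

    from stoke.configs import ClipGradConfig, ClipGradNormConfig

    # Clip each gradient element to [-1.0, 1.0] ...
    clip_by_value = ClipGradConfig(clip_value=1.0)

    # ... or clip the global gradient L2 norm to 5.0
    clip_by_norm = ClipGradNormConfig(max_norm=5.0, norm_type=2.0)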

DDPConfig

class DDPConfig(
    local_rank: Union[int, NoneType],
    auto_mpi_discovery: bool = False,
    convert_to_sync_batch_norm: bool = False,
    backend: stoke.configs.BackendOptions = 'nccl',
    broadcast_buffers: bool = True,
    bucket_cap_mb: int = 25,
    find_unused_parameters: bool = False,
    gradient_as_bucket_view: bool = False,
    init_method: str = 'env://',
    no_sync: bool = True
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| local_rank | Optional[int] | Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg) | None |
| auto_mpi_discovery | bool, default: False | if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed function call) | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html | None |
| backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
| broadcast_buffers | bool, default: True | Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function | None |
| bucket_cap_mb | int, default: 25 | DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB) | None |
| find_unused_parameters | bool, default: False | Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready to be reduced. Note that all forward outputs that are derived from module parameters must participate in calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused can be detached from the autograd graph using torch.Tensor.detach | None |
| gradient_as_bucket_view | bool, default: False | When set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by referring to the zero_grad() function in torch/optim/optimizer.py as a solution. | None |
| init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
| no_sync | bool, default: True | for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on module variables, which will later be synchronized in the first forward-backward pass after exiting the context. no_sync might lead to higher memory usage but lower communication overhead | None |

??? example "View Source" class DDPConfig:

        """PyTorch DistributedDataParallel configuration class



        Attributes

        ----------

        local_rank: Optional[int]

            Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg)

        auto_mpi_discovery: bool, default: False

            if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed

            function call)

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls

            https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html

        backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        broadcast_buffers: bool, default: True

            Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function

        bucket_cap_mb: int, default: 25

            DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket

            can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)

        find_unused_parameters: bool, default: False

            Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward

            function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready

            to be reduced. Note that all forward outputs that are derived from module parameters must participate in

            calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd

            to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused

            can be detached from the autograd graph using torch.Tensor.detach

        gradient_as_bucket_view: bool, default: False

            When set to True, gradients will be views pointing to different offsets of allreduce communication

            buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients

            size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When

            gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by

            referring to the zero_grad() function in torch/optim/optimizer.py as a solution.

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        no_sync: bool, default: True

            for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on

            module variables, which will later be synchronized in the first forward-backward pass after exiting the

            context. no_sync might lead to higher memory usage but lower communication overhead



        """



        local_rank: Optional[int]

        auto_mpi_discovery: bool = False

        convert_to_sync_batch_norm: bool = False

        backend: BackendOptions = "nccl"

        broadcast_buffers: bool = True

        bucket_cap_mb: int = 25

        find_unused_parameters: bool = False

        gradient_as_bucket_view: bool = False

        init_method: str = "env://"

        no_sync: bool = True
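
A hedged sketch of constructing this config; local_rank has no default and is commonly injected by the launcher via the LOCAL_RANK environment variable (other values illustrative):

    import os
    from stoke.configs import DDPConfig

    ddp_config = DDPConfig(
        local_rank=int(os.environ.get("LOCAL_RANK", 0)),
        convert_to_sync_batch_norm=True,  # swap BN layers for torch.nn.SyncBatchNorm
        bucket_cap_mb=50,                 # larger gradient-reduction buckets
    )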

DeepspeedAIOConfig

class DeepspeedAIOConfig(
    block_size: int = 1048576,
    ignore_unused_parameters: bool = True,
    overlap_events: bool = True,
    queue_depth: int = 8,
    single_submit: bool = False,
    thread_count: int = 1
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| block_size | int, default: 1048576 | I/O block size in bytes | None |
| ignore_unused_parameters | bool, default: True | Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. | None |
| overlap_events | bool, default: True | Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests. | None |
| queue_depth | int, default: 8 | I/O queue depth | None |
| single_submit | bool, default: False | Submit requests to storage device as multiple individual requests as opposed to one block of requests. | None |
| thread_count | int, default: 1 | Intra-request parallelism for each read/write submitted by a user thread. | None |

??? example "View Source" class DeepspeedAIOConfig:

        """Deepspeed asynchronous I/O configuration class



        Attributes

        ----------

        block_size: int, default: 1048576

            I/O block size in bytes

        ignore_unused_parameters: bool, default: True

            Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.

            This controls whether or not training should terminate with an error message when unused parameters are

            detected.

        overlap_events: bool, default: True

            Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.

        queue_depth: int, default: 8

            I/O queue depth

        single_submit: bool, default: False

            Submit requests to storage device as multiple individual requests as opposed to one block of requests.

        thread_count: int, default: 1

            Intra-request parallelism for each read/write submitted by a user thread.



        """



        block_size: int = 1048576

        ignore_unused_parameters: bool = True

        overlap_events: bool = True

        queue_depth: int = 8

        single_submit: bool = False

        thread_count: int = 1
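
A small sketch of tuning the async I/O engine used for NVMe offload (values illustrative):

    from stoke.configs import DeepspeedAIOConfig

    aio_config = DeepspeedAIOConfig(
        queue_depth=16,   # deeper I/O queue
        thread_count=2,   # more intra-request parallelism per user thread
    )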

DeepspeedActivationCheckpointingConfig

class DeepspeedActivationCheckpointingConfig(
    contiguous_memory_optimization: bool = False,
    cpu_checkpointing: bool = False,
    number_checkpoints: Union[int, NoneType] = None,
    partition_activations: bool = False,
    profile: bool = False,
    synchronize_checkpoint_boundary: bool = False
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| contiguous_memory_optimization | bool, default: False | Copies partitioned activations so that they are contiguous in memory | None |
| cpu_checkpointing | bool, default: False | Offloads partitioned activations to CPU if partition_activations is enabled | None |
| number_checkpoints | Optional[int], default: None | Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | None |
| partition_activations | bool, default: False | Enables partition activation when used with model parallelism | None |
| profile | bool, default: False | Logs the forward and backward time for each checkpoint function | None |
| synchronize_checkpoint_boundary | bool, default: False | Inserts torch.cuda.synchronize() at each checkpoint boundary | None |

??? example "View Source" class DeepspeedActivationCheckpointingConfig:

        """Deepspeed activation checkpointing configuration class



        Attributes

        ----------

        contiguous_memory_optimization: bool, default: False

            Copies partitioned activations so that they are contiguous in memory

        cpu_checkpointing: bool, default: False

            Offloads partitioned activations to CPU if partition_activations is enabled

        number_checkpoints: Optional[int], default: None

            Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization

        partition_activations: bool, default: False

            Enables partition activation when used with model parallelism

        profile: bool, default: False

            Logs the forward and backward time for each checkpoint function

        synchronize_checkpoint_boundary: bool, default: False

            Inserts torch.cuda.synchronize() at each checkpoint boundary



        """



        contiguous_memory_optimization: bool = False

        cpu_checkpointing: bool = False

        number_checkpoints: Optional[int] = None

        partition_activations: bool = False

        profile: bool = False

        synchronize_checkpoint_boundary: bool = False
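
A hedged sketch of enabling partitioned activations with CPU offload, per the attributes above (flags illustrative):

    from stoke.configs import DeepspeedActivationCheckpointingConfig

    ckpt_config = DeepspeedActivationCheckpointingConfig(
        partition_activations=True,  # partition activations under model parallelism
        cpu_checkpointing=True,      # offload the partitioned activations to CPU
    )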

DeepspeedConfig

class DeepspeedConfig(
    activation_checkpointing: Union[stoke.configs.DeepspeedActivationCheckpointingConfig, NoneType] = DeepspeedActivationCheckpointingConfig(contiguous_memory_optimization=False, cpu_checkpointing=False, number_checkpoints=None, partition_activations=False, profile=False, synchronize_checkpoint_boundary=False),
    aio: Union[stoke.configs.DeepspeedAIOConfig, NoneType] = DeepspeedAIOConfig(block_size=1048576, ignore_unused_parameters=True, overlap_events=True, queue_depth=8, single_submit=False, thread_count=1),
    auto_mpi_discovery: bool = True,
    disable_allgather: bool = False,
    dist_backend: stoke.configs.BackendOptions = 'nccl',
    distributed_port: int = 29500,
    dump_state: bool = False,
    flops_profiler: Union[stoke.configs.DeepspeedFlopsConfig, NoneType] = None,
    fp16: Union[stoke.configs.DeepspeedFP16Config, NoneType] = None,
    fp32_allreduce: bool = False,
    gradient_predivide_factor: float = 1.0,
    init_method: str = 'env://',
    prescale_gradients: bool = False,
    progressive_layer_drop: Union[stoke.configs.DeepspeedPLDConfig, NoneType] = None,
    sparse_gradients: bool = False,
    steps_per_print: int = 10,
    tensorboard: Union[stoke.configs.DeepspeedTensorboardConfig, NoneType] = None,
    verbose: bool = True,
    wall_clock_breakdown: bool = False,
    zero_optimization: Union[stoke.configs.DeepspeedZeROConfig, NoneType] = DeepspeedZeROConfig(allgather_bucket_size=500000000, allgather_partitions=True, contiguous_gradients=False, ignore_unused_parameters=True, legacy_stage1=False, offload_optimizer=None, offload_param=None, overlap_comm=False, reduce_bucket_size=500000000, reduce_scatter=True, stage=0, stage3_max_live_parameters=1000000000, stage3_max_reuse_distance=1000000000, stage3_prefetch_bucket_size=500000000, stage3_param_persistence_threshold=1000000, stage3_gather_fp16_weights_on_model_save=False, sub_group_size=1000000000000)
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| activation_checkpointing | Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig() | Enables and configures activation checkpointing | None |
| aio | Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig() | Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent (NVMe) storage | None |
| auto_mpi_discovery | bool, default: True | if distributed environment variables are not set, attempt to discover them from MPI | None |
| disable_allgather | bool, default: False | Disables allgather | None |
| dist_backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
| distributed_port | int, default: 29500 | torch distributed backend port | None |
| dump_state | bool, default: False | Print out state information of DeepSpeed object after initialization | None |
| flops_profiler | Optional[DeepspeedFlopsConfig], default: None | Enables and configures the flops profiler. This would also enable wall_clock_breakdown | None |
| fp16 | Optional[DeepspeedFP16Config], default: None | Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package | None |
| fp32_allreduce | bool, default: False | During gradient averaging perform allreduce with 32 bit values | None |
| gradient_predivide_factor | float, default: 1.0 | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs | None |
| init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
| prescale_gradients | bool, default: False | Scale gradients before doing allreduce | None |
| progressive_layer_drop | Optional[DeepspeedPLDConfig], default: None | Enables and configures progressive layer dropping | None |
| sparse_gradients | bool, default: False | Enable sparse compression of torch.nn.Embedding gradients | None |
| steps_per_print | int, default: 10 | Print train loss every N steps | None |
| tensorboard | Optional[DeepspeedTensorboardConfig], default: None | Enables and configures tensorboard support | None |
| verbose | bool, default: True | flag to make deepspeed engine verbose with information | None |
| wall_clock_breakdown | bool, default: False | Enable timing of the latency of forward/backward/update training phases | None |
| zero_optimization | Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig() | Enables and configures ZeRO memory optimizations | None |
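
A hedged sketch of composing the nested Deepspeed configs, here ZeRO stage 3 with CPU optimizer offload and dynamic-loss-scale FP16 (all values illustrative):

    from stoke.configs import (
        DeepspeedConfig,
        DeepspeedFP16Config,
        DeepspeedOffloadOptimizerConfig,
        DeepspeedZeROConfig,
    )

    deepspeed_config = DeepspeedConfig(
        fp16=DeepspeedFP16Config(),  # loss_scale=0.0 --> dynamic loss scaling
        zero_optimization=DeepspeedZeROConfig(
            stage=3,  # optimizer + gradient + parameter partitioning
            offload_optimizer=DeepspeedOffloadOptimizerConfig(device="cpu"),
        ),
    )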

??? example "View Source" class DeepspeedConfig:

        """Deepspeed configuration class



        Composed of other configuration classes related to specific functionality



        Attributes

        ----------

        activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()

            Enables and configures activation checkpointing

        aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()

            Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent

            (NVMe) storage

        auto_mpi_discovery: bool, default: True

            if distributed environment variables are not set, attempt to discover them from MPI

        disable_allgather: bool, default: False

            Disables allgather

        dist_backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        distributed_port: int, default: 29500

            torch distributed backend port

        dump_state: bool, default: False

            Print out state information of DeepSpeed object after initialization

        flops_profiler: Optional[DeepspeedFlopsConfig], default: None

            Enables and configures the flops profiler. This would also enable wall_clock_breakdown

        fp16: Optional[DeepspeedFP16Config], default: None

            Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package

        fp32_allreduce: bool, default: False

            During gradient averaging perform allreduce with 32 bit values

        gradient_predivide_factor: float, default: 1.0

            Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability

            when scaling to large numbers of GPUs

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        prescale_gradients: bool, default: False

            Scale gradients before doing allreduce

        progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None

            Enables and configures progressive layer dropping

        sparse_gradients: bool, default: False

            Enable sparse compression of torch.nn.Embedding gradients

        steps_per_print: int, default: 10

            Print train loss every N steps

        tensorboard: Optional[DeepspeedTensorboardConfig], default: None

            Enables and configures tensorboard support

        verbose: bool, default: True

            flag to make deepspeed engine verbose with information

        wall_clock_breakdown: bool, default: False

            Enable timing of the latency of forward/backward/update training phases

        zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()

            Enables and configures ZeRO memory optimizations



        Notes

        -----

        Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16

        here is similar to AMP’s O2 mode



        """



        activation_checkpointing: Optional[

            DeepspeedActivationCheckpointingConfig

        ] = DeepspeedActivationCheckpointingConfig()

        aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()

        auto_mpi_discovery: bool = True

        disable_allgather: bool = False

        dist_backend: BackendOptions = "nccl"

        distributed_port: int = 29500

        dump_state: bool = False

        flops_profiler: Optional[DeepspeedFlopsConfig] = None

        fp16: Optional[DeepspeedFP16Config] = None

        fp32_allreduce: bool = False

        gradient_predivide_factor: float = 1.0

        init_method: str = "env://"

        prescale_gradients: bool = False

        progressive_layer_drop: Optional[DeepspeedPLDConfig] = None

        sparse_gradients: bool = False

        steps_per_print: int = 10

        tensorboard: Optional[DeepspeedTensorboardConfig] = None

        verbose: bool = True

        wall_clock_breakdown: bool = False

        zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()
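
As an illustrative sketch (assuming stoke is installed with its DeepSpeed extras), the class is built by composing the sub-configs documented on this page; the chosen values below are examples only, and handing the resulting object to the Stoke wrapper is outside the scope of this page:

```python
from stoke.configs import DeepspeedConfig, DeepspeedFP16Config, DeepspeedZeROConfig

# Hypothetical configuration: FP16 with dynamic loss scaling plus ZeRO stage 2
deepspeed_config = DeepspeedConfig(
    dist_backend="nccl",                       # communication backend
    fp16=DeepspeedFP16Config(loss_scale=0.0),  # 0.0 --> dynamic loss scaling
    zero_optimization=DeepspeedZeROConfig(
        stage=2,                    # optimizer + gradient state partitioning
        overlap_comm=True,          # overlap gradient reduction with backward compute
        contiguous_gradients=True,  # avoid memory fragmentation for very large models
    ),
    steps_per_print=100,            # print train loss every 100 steps
)
```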

DeepspeedFP16Config

class DeepspeedFP16Config(
    hysteresis: int = 2,
    initial_scale_power: int = 32,
    loss_scale: float = 0.0,
    loss_scale_window: int = 1000,
    min_loss_scale: int = 1000
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| hysteresis | int, default: 2 | represents the delay shift in dynamic loss scaling | None |
| initial_scale_power | int, default: 32 | power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power | None |
| loss_scale | float, default: 0.0 | loss scaling value for FP16 training (0.0 --> dynamic scaling) | None |
| loss_scale_window | int, default: 1000 | the window over which to raise/lower the dynamic loss scale value | None |
| min_loss_scale | int, default: 1000 | minimum dynamic loss scale value | None |

??? example "View Source" class DeepspeedFP16Config:

        """Deepspeed FP16 configuration class



        Attributes

        ----------

        hysteresis: int, default: 2

            represents the delay shift in dynamic loss scaling

        initial_scale_power: int, default: 32

            power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power

        loss_scale: float, default: 0.0

            loss scaling value for FP16 training (0.0 --> dynamic scaling)

        loss_scale_window: int, default: 1000

            the window over which to raise/lower the dynamic loss scale value

        min_loss_scale: int, default: 1000

            minimum dynamic loss scale value



        """



        hysteresis: int = 2

        initial_scale_power: int = 32

        loss_scale: float = 0.0

        loss_scale_window: int = 1000

        min_loss_scale: int = 1000
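
A minimal sketch of the two loss-scaling modes (illustrative values only); under dynamic scaling the starting scale is 2 ** initial_scale_power:

```python
from stoke.configs import DeepspeedFP16Config

# Dynamic loss scaling: loss_scale stays at 0.0 and the scale starts at 2 ** 16
dynamic_fp16 = DeepspeedFP16Config(
    initial_scale_power=16,  # initial scale = 2 ** 16 = 65536
    loss_scale_window=500,   # window over which the dynamic scale is raised/lowered
    min_loss_scale=1,        # floor for the dynamic loss scale
)

# Static loss scaling: any non-zero loss_scale disables dynamic scaling
static_fp16 = DeepspeedFP16Config(loss_scale=128.0)
```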

DeepspeedFlopsConfig

class DeepspeedFlopsConfig(
    detailed: bool = True,
    module_depth: int = -1,
    output_file: Union[str, NoneType] = None,
    profile_step: int = 1,
    top_modules: int = 1
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| detailed | bool, default: True | Whether to print the detailed model profile | None |
| module_depth | int, default: -1 | The depth of the model at which to print the aggregated module information. When set to -1, it prints information from the top module to the innermost modules (the maximum depth). | None |
| output_file | Optional[str], default: None | Path to the output file. If None, the profiler prints to stdout | None |
| profile_step | int, default: 1 | The global training step at which to profile. | None |
| top_modules | int, default: 1 | Limits the aggregated profile output to the number of top modules specified. | None |

??? example "View Source" class DeepspeedFlopsConfig:

        """Deepspeed flops profiler configuration class



        Attributes

        ----------

        detailed: bool, default: True

            Whether to print the detailed model profile

        module_depth: int, default: -1

            The depth of the model at which to print the aggregated module information. When set to -1, it prints

            information from the top module to the innermost modules (the maximum depth).

        output_file: Optional[str], default: None

            Path to the output file. If None, the profiler prints to stdout

        profile_step: int, default: 1

            The global training step at which to profile.

        top_modules: int, default: 1

            Limits the aggregated profile output to the number of top modules specified.



        Notes

        -----

        Warm up steps are needed for accurate time measurement



        """



        detailed: bool = True

        module_depth: int = -1

        output_file: Optional[str] = None

        profile_step: int = 1

        top_modules: int = 1
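
An illustrative sketch; the output path is a placeholder, and the profiler is attached via the flops_profiler attribute of DeepspeedConfig, which also turns on wall_clock_breakdown:

```python
from stoke.configs import DeepspeedConfig, DeepspeedFlopsConfig

flops = DeepspeedFlopsConfig(
    profile_step=10,                 # profile at global training step 10 (after warm up)
    detailed=True,                   # include the per-module breakdown
    top_modules=3,                   # limit the aggregated output to the top 3 modules
    output_file="flops_report.txt",  # placeholder path; None prints to stdout instead
)

ds_config = DeepspeedConfig(flops_profiler=flops)
```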

DeepspeedOffloadOptimizerConfig

class DeepspeedOffloadOptimizerConfig(
    buffer_count: int = 4,
    device: stoke.configs.OffloadDevice = 'cpu',
    fast_init: bool = False,
    nvme_path: str = '/local_nvme',
    pin_memory: bool = False,
    pipeline: bool = False,
    pipeline_read: bool = False,
    pipeline_write: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| buffer_count | int, default: 4 | Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). | None |
| device | OffloadDevice, default: 'cpu' | Device memory to offload optimizer state | None |
| fast_init | bool, default: False | Enable fast optimizer initialization when offloading to NVMe | None |
| nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for optimizer state offloading | None |
| pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |
| pipeline | bool, default: False | pipeline activated (will default to True if either pipeline_read or pipeline_write is set to True) | None |
| pipeline_read | bool, default: False | activate pipeline read (deepspeed has limited docs for what this does) | None |
| pipeline_write | bool, default: False | activate pipeline write (deepspeed has limited docs for what this does) | None |

??? example "View Source" class DeepspeedOffloadOptimizerConfig:

        """Deepspeed optimizer offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 4

            Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number

            of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,

            gradient, momentum, and variance).

        device: OffloadDevice, default: 'cpu'

            Device memory to offload optimizer state

        fast_init: bool, default: False

            Enable fast optimizer initialization when offloading to NVMe

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for optimizer state offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.

        pipeline: bool, default: False

            pipeline activated (will default to True if either pipeline_read or pipeline_write is set to True)

        pipeline_read: bool, default: False

            activate pipeline read (deepspeed has limited docs for what this does)

        pipeline_write: bool, default: False

            activate pipeline write (deepspeed has limited docs for what this does)



        """



        buffer_count: int = 4

        device: OffloadDevice = "cpu"

        fast_init: bool = False

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False

        pipeline: bool = False

        pipeline_read: bool = False

        pipeline_write: bool = False
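
Two hedged examples of how this config might be filled in, one for CPU offload and one for NVMe offload; the counts are illustrative only:

```python
from stoke.configs import DeepspeedOffloadOptimizerConfig

# Offload optimizer state to page-locked CPU memory (extra memory for more throughput)
cpu_offload = DeepspeedOffloadOptimizerConfig(device="cpu", pin_memory=True)

# Offload optimizer state to NVMe; buffer_count should cover the optimizer's
# per-parameter states (e.g. 4 for Adam: parameter, gradient, momentum, variance)
nvme_offload = DeepspeedOffloadOptimizerConfig(
    device="nvme",
    nvme_path="/local_nvme",  # documented default filesystem path
    buffer_count=4,
    fast_init=True,           # fast optimizer initialization when offloading to NVMe
)
```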

DeepspeedOffloadParamConfig

class DeepspeedOffloadParamConfig(
    buffer_count: int = 5,
    buffer_size: int = 100000000,
    device: stoke.configs.OffloadDevice = 'cpu',
    max_in_cpu: int = 1000000000,
    nvme_path: str = '/local_nvme',
    pin_memory: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| buffer_count | int, default: 5 | Number of buffers in buffer pool for parameter offloading to NVMe | None |
| buffer_size | int, default: int(1E8) | Size of buffers in buffer pool for parameter offloading to NVMe | None |
| device | OffloadDevice, default: 'cpu' | Device memory to offload model parameters | None |
| max_in_cpu | int, default: int(1E9) | Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | None |
| nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for parameter offloading | None |
| pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |

??? example "View Source" class DeepspeedOffloadParamConfig:

        """Deepspeed parameter offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 5

            Number of buffers in buffer pool for parameter offloading to NVMe

        buffer_size: int, default: int(1E8)

            Size of buffers in buffer pool for parameter offloading to NVMe

        device: OffloadDevice, default: 'cpu'

            Device memory to offload model parameters

        max_in_cpu: int, default: int(1E9)

            Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for parameter offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.



        """



        buffer_count: int = 5

        buffer_size: int = int(1e8)

        device: OffloadDevice = "cpu"

        max_in_cpu: int = int(1e9)

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False
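
A sketch of parameter offloading to NVMe under the same assumptions as above; the numeric values simply restate the documented defaults:

```python
from stoke.configs import DeepspeedOffloadParamConfig

param_offload = DeepspeedOffloadParamConfig(
    device="nvme",
    nvme_path="/local_nvme",  # documented default filesystem path
    buffer_count=5,           # buffers in the NVMe offload buffer pool
    buffer_size=int(1e8),     # elements per buffer
    max_in_cpu=int(1e9),      # parameter elements kept in CPU memory
    pin_memory=True,          # page-locked CPU memory for higher throughput
)
```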

DeepspeedPLDConfig

class DeepspeedPLDConfig(
    theta: float = 1.0,
    gamma: float = 0.001
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| theta | float, default: 1.0 | Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value, the faster the training speed | None |
| gamma | float, default: 0.001 | Hyper-parameter that controls how fast the drop ratio increases | None |

??? example "View Source" class DeepspeedPLDConfig:

        """

        Attributes

        ----------

        theta: float, default: 1.0

            Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,

            the faster the training speed

        gamma: float, default: 0.001

            Hyper-parameter that controls how fast the drop ratio increases



        """



        theta: float = 1.0

        gamma: float = 0.001

DeepspeedTensorboardConfig

class DeepspeedTensorboardConfig(
    output_path: str = '',
    job_name: str = 'DeepSpeedJobName'
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| output_path | str, default: '' | Tensorboard output path | None |
| job_name | str, default: 'DeepSpeedJobName' | Tensorboard job name | None |

??? example "View Source" class DeepspeedTensorboardConfig:

        """Deepspeed Tensorboard configuration class



        Attributes

        ----------

        output_path: str, default: ''

            Tensorboard output path

        job_name: str, default: 'DeepSpeedJobName'

            Tensorboard job name



        """



        output_path: str = ""

        job_name: str = "DeepSpeedJobName"
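
Both of the small configs above slot directly into DeepspeedConfig; a brief hedged sketch with placeholder path and job name:

```python
from stoke.configs import DeepspeedConfig, DeepspeedPLDConfig, DeepspeedTensorboardConfig

ds_config = DeepspeedConfig(
    progressive_layer_drop=DeepspeedPLDConfig(theta=0.5, gamma=0.001),
    tensorboard=DeepspeedTensorboardConfig(
        output_path="./runs",        # placeholder output directory
        job_name="my-training-job",  # placeholder job name
    ),
)
```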

DeepspeedZeROConfig

class DeepspeedZeROConfig(
    allgather_bucket_size: int = 500000000,
    allgather_partitions: bool = True,
    contiguous_gradients: bool = False,
    ignore_unused_parameters: bool = True,
    legacy_stage1: bool = False,
    offload_optimizer: Union[stoke.configs.DeepspeedOffloadOptimizerConfig, NoneType] = None,
    offload_param: Union[stoke.configs.DeepspeedOffloadParamConfig, NoneType] = None,
    overlap_comm: bool = False,
    reduce_bucket_size: int = 500000000,
    reduce_scatter: bool = True,
    stage: int = 0,
    stage3_max_live_parameters: int = 1000000000,
    stage3_max_reuse_distance: int = 1000000000,
    stage3_prefetch_bucket_size: int = 500000000,
    stage3_param_persistence_threshold: int = 1000000,
    stage3_gather_fp16_weights_on_model_save: bool = False,
    sub_group_size: int = 1000000000000
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| allgather_bucket_size | int, default: int(5E8) | Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes | None |
| allgather_partitions | bool, default: True | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | None |
| contiguous_gradients | bool, default: False | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during the backward pass. Only useful when running very large models. | None |
| ignore_unused_parameters | bool, default: True | Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload. Enable this option to avoid https://github.com/microsoft/DeepSpeed/issues/707 | None |
| legacy_stage1 | bool, default: False | Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons | None |
| offload_optimizer | Optional[DeepspeedOffloadOptimizerConfig], default: None | Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3 | None |
| offload_param | Optional[DeepspeedOffloadParamConfig], default: None | Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | None |
| overlap_comm | bool, default: False | Attempts to overlap the reduction of the gradients with backward computation | None |
| reduce_bucket_size | int, default: int(5E8) | Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | None |
| reduce_scatter | bool, default: True | Uses reduce or reduce scatter instead of allreduce to average gradients | None |
| stage | int, default: 0 | Chooses different stages of the ZeRO optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively | None |
| stage3_max_live_parameters | int, default: int(1E9) | The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. | None |
| stage3_max_reuse_distance | int, default: int(1E9) | Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. | None |
| stage3_prefetch_bucket_size | int, default: int(5E8) | The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. | None |
| stage3_param_persistence_threshold | int, default: int(1E6) | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | None |
| stage3_gather_fp16_weights_on_model_save | bool, default: False | Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | None |
| sub_group_size | int, default: int(1E12) | Controls the granularity in which parameters are updated during optimizer steps. Parameters are grouped into buckets of sub_group_size and each bucket is updated one at a time. | None |

??? example "View Source" class DeepspeedZeROConfig:

        """Deepspeed ZeRO configuration class



        Attributes

        ----------

        allgather_bucket_size: int, default: int(5E8)

            Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes

        allgather_partitions: bool, default: True

            Chooses between allgather collective or a series of broadcast collectives to gather updated parameters

            from all the GPUs at the end of each step

        contiguous_gradients: bool, default: False

            Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward

            pass. Only useful when running very large models.

        ignore_unused_parameters: bool, default: True

            Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload

            Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707

        legacy_stage1: bool, default: False

            Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None

            Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU

            memory for larger models or batch sizes. Valid only with stage 3

        offload_param: Optional[DeepspeedOffloadParamConfig], default: None

            Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch

            sizes. Valid only with stage 3.

        overlap_comm: bool, default: False

            Attempts to overlap the reduction of the gradients with backward computation

        reduce_bucket_size: int, default: int(5E8)

            Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large

            model sizes

        reduce_scatter: bool, default: True

            Uses reduce or reduce scatter instead of allreduce to average gradients

        stage: int, default: 0

            Chooses different stages of the ZeRO optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state

            partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,

            respectively

        stage3_max_live_parameters: int, default: int(1E9)

            The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but

            perform more communication.

        stage3_max_reuse_distance: int, default: int(1E9)

            Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less

            memory, but perform more communication.

        stage3_prefetch_bucket_size: int, default: int(5E8)

            The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase

            stalls due to communication.

        stage3_param_persistence_threshold: int, default: int(1E6)

            Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly

            increase communication (especially latency-bound messages).

        stage3_gather_fp16_weights_on_model_save: bool, default: False

            Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned

            across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this

            option is enabled and then saves the fp16 model weights.

        sub_group_size: int, default: int(1E12)

            sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are

            grouped into buckets of sub_group_size and each buckets is updated one at a time.



        """



        allgather_bucket_size: int = int(5e8)

        allgather_partitions: bool = True

        contiguous_gradients: bool = False

        ignore_unused_parameters: bool = True

        legacy_stage1: bool = False

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None

        offload_param: Optional[DeepspeedOffloadParamConfig] = None

        overlap_comm: bool = False

        reduce_bucket_size: int = int(5e8)

        reduce_scatter: bool = True

        stage: int = 0

        stage3_max_live_parameters: int = int(1e9)

        stage3_max_reuse_distance: int = int(1e9)

        stage3_prefetch_bucket_size: int = int(5e8)

        stage3_param_persistence_threshold: int = int(1e6)

        stage3_gather_fp16_weights_on_model_save: bool = False

        sub_group_size: int = int(1e12)
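
A hedged sketch of a ZeRO stage 3 setup that combines the offload configs documented earlier (offload_optimizer and offload_param are only valid with stage 3); the values are illustrative:

```python
from stoke.configs import (
    DeepspeedOffloadOptimizerConfig,
    DeepspeedOffloadParamConfig,
    DeepspeedZeROConfig,
)

# Stage 3: optimizer + gradient + parameter partitioning, with CPU offload
zero3 = DeepspeedZeROConfig(
    stage=3,
    overlap_comm=True,           # overlap gradient reduction with backward compute
    contiguous_gradients=True,   # avoid memory fragmentation for very large models
    offload_optimizer=DeepspeedOffloadOptimizerConfig(device="cpu", pin_memory=True),
    offload_param=DeepspeedOffloadParamConfig(device="cpu", pin_memory=True),
    stage3_gather_fp16_weights_on_model_save=True,  # consolidate weights on save
)
```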

FairscaleFSDPConfig

class FairscaleFSDPConfig(
    bucket_cap_mb: int = 25,
    buffer_dtype: Union[torch.dtype, NoneType] = None,
    clear_autocast_cache: bool = False,
    compute_dtype: Union[torch.dtype, NoneType] = None,
    flatten_parameters: bool = True,
    force_input_to_fp32: bool = False,
    fp32_reduce_scatter: bool = False,
    gradient_predivide_factor: Union[float, NoneType] = None,
    gradient_postdivide_factor: Union[float, NoneType] = None,
    move_grads_to_cpu: Union[bool, NoneType] = None,
    move_params_to_cpu: bool = False,
    no_broadcast_optim_state: Union[bool, NoneType] = False,
    reshard_after_forward: bool = True,
    verbose: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| bucket_cap_mb | int, default: 25 | FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters. bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple bucket_cap_mb sized buffers shared by all FSDP instances). Large gradient tensors are directly reduced without using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during the backward pass and freed at the end of the backward pass to save more memory for other phases of the training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the backward. In FSDP, the buffer size does not change with model size (it changes based on the number of `<dtype, device, process_group>` tuples) and gradient ready order matters little since FSDP has a final flush call that ensures everything is reduced and not all gradients need to be known upfront. Overlapping with compute is done differently too. Values <= 0 disable bucketing | None |
| buffer_dtype | Optional[torch.dtype], default: None | dtype for buffers for computation; defaults to the value of compute_dtype | None |
| clear_autocast_cache | bool, default: False | When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this flag to true will help clear this cache as inner FSDP instances finish part of the forward pass to save GPU memory | None |
| compute_dtype | Optional[torch.dtype], default: None | dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set, in which case it defaults to torch.float16. | torch.float32 |
| flatten_parameters | bool, default: True | flatten parameters into a single contiguous tensor, which improves training speed | None |
| force_input_to_fp32 | bool, default: False | force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper. | None |
| fp32_reduce_scatter | bool, default: False | reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used | None |
| gradient_predivide_factor | Optional[float], default: None | divide factor before the reduction | None |
| gradient_postdivide_factor | Optional[float], default: None | divide factor after the reduction | None |
| move_grads_to_cpu | Optional[bool], default: None | move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used | None |
| move_params_to_cpu | bool, default: False | offload FP32 params to CPU. This is only relevant when FP16 AMP is used | None |
| no_broadcast_optim_state | Optional[bool], default: False | do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few parameters can fit on one node | None |
| reshard_after_forward | bool, default: True | reshard parameters after the forward pass. This saves memory but slows training. This is only relevant when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html) | None |
| verbose | bool, default: False | turn on verbose output for model’s string representation | None |

??? example "View Source" class FairscaleFSDPConfig:

        """Fairscale Fully Sharded Data Parallel configuration class



        Attributes

        ----------

        bucket_cap_mb: int, default: 25

            FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.

            bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the

            max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple

            bucket_cap_mb sized buffers shared by all FSDP instances). Large gradient tensors are directly reduced without

            using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with

            computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory

            overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during

            the backward pass and freed at the end of the backward pass to save more memory for other phases of the

            training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP

            engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The

            order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the

            backward. In FSDP, the buffer size does not change with model size (it changes based on number of

            <dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush

            call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with

            compute is done differently too. Values <= 0 disable bucketing

        buffer_dtype: Optional[torch.dtype], default: None

            dtype for buffers for computation. defaults to value of compute_dtype

        clear_autocast_cache: bool, default: False

            When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast

            maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this

            flag to true will help clear this cache as inner FSDP instances finish part of the forward pass to save

            GPU memory

        compute_dtype: Optional[torch.dtype], default: None

            dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,

            in which case it defaults to torch.float16.

        flatten_parameters: bool, default: True

            flatten parameters into a single contiguous tensor, which improves training speed

        force_input_to_fp32: bool, default: False

            force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision

            mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.

        fp32_reduce_scatter: bool, default: False

            reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used

        gradient_predivide_factor: Optional[float], default: None

            divide factor before the reduction

        gradient_postdivide_factor: Optional[float], default: None

            divide factor after the reduction

        move_grads_to_cpu: Optional[bool], default: None

            move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used

        move_params_to_cpu: bool, default: False

            offload FP32 params to CPU. This is only relevant when FP16 AMP is used

        no_broadcast_optim_state: Optional[bool], default: False

            do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this

            true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the

            proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few

            parameters can fit on one node

        reshard_after_forward: bool, default: True

            reshard parameters after the forward pass. This saves memory but slows training. This is only relevant

            when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)

        verbose: bool, default: False

            turn on verbose output for model’s string representation



        Notes

        -----

        mixed_precision: bool

            This value will automatically be set from the Stoke FP16 selected option (AMP only)

        state_dict_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup

        compute_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup



        """



        bucket_cap_mb: int = 25

        buffer_dtype: Optional[torch.dtype] = None

        clear_autocast_cache: bool = False

        compute_dtype: Optional[torch.dtype] = None

        flatten_parameters: bool = True

        force_input_to_fp32: bool = False

        fp32_reduce_scatter: bool = False

        gradient_predivide_factor: Optional[float] = None

        gradient_postdivide_factor: Optional[float] = None

        move_grads_to_cpu: Optional[bool] = None

        move_params_to_cpu: bool = False

        no_broadcast_optim_state: Optional[bool] = False

        reshard_after_forward: bool = True

        verbose: bool = False

Descendants

  • stoke.extensions._FairscaleFSDPConfig
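
A hedged sketch of a memory-leaning FSDP setup; the values are illustrative, and mixed_precision, state_dict_device, and compute_device are managed internally by Stoke as noted above:

```python
from stoke.configs import FairscaleFSDPConfig

fsdp_config = FairscaleFSDPConfig(
    bucket_cap_mb=25,            # values <= 0 would disable gradient bucketing
    flatten_parameters=True,     # single contiguous tensor, faster training
    reshard_after_forward=True,  # saves memory at the cost of training speed
    move_grads_to_cpu=True,      # only relevant when FP16 AMP is used
    fp32_reduce_scatter=True,    # reduce-scatter gradients in FP32 under FP16 AMP
)
```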

FairscaleOSSConfig

class FairscaleOSSConfig(
    broadcast_fp16: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| broadcast_fp16 | bool, default: False | Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy. | None |

??? example "View Source" class FairscaleOSSConfig:

        """Fairscale optimizer state sharding configuration class



        Attributes

        ----------

        broadcast_fp16: bool, default: False

            Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP

            is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.



        """



        broadcast_fp16: bool = False

FairscaleSDDPConfig

class FairscaleSDDPConfig(
    auto_refresh_trainable: bool = True,
    broadcast_buffers: bool = True,
    reduce_buffer_size: int = 8388608,
    reduce_fp16: bool = False,
    sync_models_at_startup: bool = True
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| auto_refresh_trainable | bool, default: True | Check whether the parameters' trainability (requires_grad) has changed and update both ShardedDDP and OSS automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a parameter is frozen or unfrozen | None |
| broadcast_buffers | bool, default: True | Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same setting as in PyTorch DDP; this is in addition to the broadcast and reduction of the model parameters. | None |
| reduce_buffer_size | int, default: 2 ** 23 | The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact the long term memory consumption, because these buckets correspond to parameters which will not be sharded. Set to 0 to remove all bucketing, 1M to 8M is usually reasonable. | None |
| reduce_fp16 | bool, default: False | Cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and will also save some memory. | None |
| sync_models_at_startup | bool, default: True | Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or the training restarts from a saved state | None |

??? example "View Source" class FairscaleSDDPConfig:

        """Fairscale sharded data parallel (SDDP) configuration class



        Attributes

        ----------

        auto_refresh_trainable: bool, default: True

            Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS

            automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a

            parameter is frozen or unfrozen

        broadcast_buffers: bool, default: True

            Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same

            setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters.

        reduce_buffer_size: int, default: 2 ** 23

            The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact

            the long term memory consumption, because these buckets correspond to parameters which will not be sharded.

            Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.

        reduce_fp16: bool, default: False

            cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve

            performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and

            will also save some memory.

        sync_models_at_startup: bool, default: True

            Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or

            the training restarts from a saved state



        """



        auto_refresh_trainable: bool = True

        broadcast_buffers: bool = True

        reduce_buffer_size: int = 2 ** 23

        reduce_fp16: bool = False

        sync_models_at_startup: bool = True
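
Fairscale's sharded data parallel wrapper is typically paired with optimizer state sharding (OSS); a hedged sketch of both configs with illustrative values:

```python
from stoke.configs import FairscaleOSSConfig, FairscaleSDDPConfig

oss_config = FairscaleOSSConfig(broadcast_fp16=False)

sddp_config = FairscaleSDDPConfig(
    reduce_buffer_size=2 ** 23,   # elements; 0 removes bucketing, 1M to 8M is reasonable
    reduce_fp16=True,             # cast grads to fp16 before reducing (multi-node + AMP)
    sync_models_at_startup=True,  # skip only if every rank starts from the same seed/state
)
```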

HorovodConfig

class HorovodConfig(
    compression: bool = False,
    convert_to_sync_batch_norm: bool = False,
    gradient_predivide_factor: float = 1.0,
    op: stoke.configs.HorovodOps = 'Average'
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| compression | bool, default: False | Compression algorithm used during allreduce to reduce the amount of data sent during each parameter update step. | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm | None |
| gradient_predivide_factor | float, default: 1.0 | If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum. | None |
| op | HorovodOps, default: 'Average' | The reduction operation to use when combining gradients across different ranks. | None |

??? example "View Source" class HorovodConfig:

        """Horovod configuration class



        Attributes

        ----------

        compression: bool, default: False

            Compression algorithm used during allreduce to reduce the amount of data sent during each parameter

            update step.

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls

            https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm

        gradient_predivide_factor: float, default: 1.0

            If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled

            by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.

        op: HorovodOps, default: 'Average'

            The reduction operation to use when combining gradients across different ranks.



        """



        compression: bool = False

        convert_to_sync_batch_norm: bool = False

        gradient_predivide_factor: float = 1.0

        op: HorovodOps = "Average"
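
A hedged sketch of a Horovod setup that averages gradients and pre-divides them for fp16 stability at larger world sizes; the values are illustrative:

```python
from stoke.configs import HorovodConfig

horovod_config = HorovodConfig(
    op="Average",                   # HorovodOps: 'Average', 'Sum', or 'Adasum'
    gradient_predivide_factor=2.0,  # scale by 1/2 before the sum, 2/size after it
    convert_to_sync_batch_norm=True,
    compression=True,               # compress data sent during allreduce
)
```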

HorovodOps

class HorovodOps(
    /,
    *args,
    **kwargs
)

??? example "View Source" class HorovodOps(Enum):

        """Horovod ops options"""



        Average = "Average"

        Sum = "Sum"

        Adasum = "Adasum"

Ancestors (in MRO)

  • enum.Enum

Class variables

Adasum
Average
Sum
name
value

OffloadDevice

class OffloadDevice(
    /,
    *args,
    **kwargs
)

??? example "View Source" class OffloadDevice(Enum):

        """Offload device options"""



        none = "none"

        cpu = "cpu"

        nvme = "nvme"

Ancestors (in MRO)

  • enum.Enum

Class variables

cpu
name
none
nvme
value

StokeOptimizer

class StokeOptimizer(
    /,
    *args,
    **kwargs
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| optimizer | Type[torch.optim.Optimizer] | un-instantiated torch.optim.Optimizer class | None |
| optimizer_kwargs | Dict | any keyword args to be unrolled into the optimizer at instantiation time | None |

??? example "View Source" class StokeOptimizer(TypedDict):

        """Stoke optimizer wrapper class



        Given all the different backends and extensions the optimizer might need to be instantiated in a different way

        thus this typed dict holds the configuration without instantiation



        Attributes

        ----------

        optimizer: Type[torch.optim.Optimizer]

            un-instantiated torch.optim.Optimizer class

        optimizer_kwargs: Dict

            any keyword args to be unrolled into the optimizer at instantiation time



        """



        optimizer: Type[torch.optim.Optimizer]

        optimizer_kwargs: Dict
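
Because StokeOptimizer is a TypedDict, constructing it just produces a plain dict; a minimal sketch with assumed AdamW hyper-parameters:

```python
import torch

from stoke.configs import StokeOptimizer

# The optimizer class is passed un-instantiated; the kwargs are unrolled into it
# later, since different backends/extensions instantiate the optimizer differently
opt = StokeOptimizer(
    optimizer=torch.optim.AdamW,
    optimizer_kwargs={"lr": 1e-3, "weight_decay": 0.01},
)
```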

Ancestors (in MRO)

  • builtins.dict

Methods

clear

def clear(
    ...
)

D.clear() -> None. Remove all items from D.

copy

def copy(
    ...
)

D.copy() -> a shallow copy of D

fromkeys

def fromkeys(
    iterable,
    value=None,
    /
)

Create a new dictionary with keys from iterable and values set to value.

get

def get(
    self,
    key,
    default=None,
    /
)

Return the value for key if key is in the dictionary, else default.

items

def items(
    ...
)

D.items() -> a set-like object providing a view on D's items

keys

def keys(
    ...
)

D.keys() -> a set-like object providing a view on D's keys

pop

def pop(
    ...
)

D.pop(k[,d]) -> v, remove specified key and return the corresponding value.

If key is not found, d is returned if given, otherwise KeyError is raised

popitem

def popitem(
    self,
    /
)

Remove and return a (key, value) pair as a 2-tuple.

Pairs are returned in LIFO (last-in, first-out) order. Raises KeyError if the dict is empty.

setdefault

def setdefault(
    self,
    key,
    default=None,
    /
)

Insert key with a value of default if key is not in the dictionary.

Return the value for key if key is in the dictionary, else default.

update

def update(
    ...
)

D.update([E, ]**F) -> None. Update D from dict/iterable E and F.

If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v In either case, this is followed by: for k in F: D[k] = F[k]

values

def values(
    ...
)

D.values() -> an object providing a view on D's values
