Module stoke.configs

Handles all config objects


??? example "View Source" # -- coding: utf-8 --

    # Copyright FMR LLC <opensource@fidelity.com>

    # SPDX-License-Identifier: Apache-2.0



    """Handles all config objects"""



    from enum import Enum

    from typing import Dict, Optional, Type



    import attr

    import torch



    try:

        from typing import TypedDict

    except ImportError:

        from mypy_extensions import TypedDict





    class HorovodOps(Enum):

        """Horovod ops options"""



        Average = "Average"

        Sum = "Sum"

        Adasum = "Adasum"





    class OffloadDevice(Enum):

        """Offload device options"""



        none = "none"

        cpu = "cpu"

        nvme = "nvme"





    class BackendOptions(Enum):

        """Communication backend options"""



        nccl = "nccl"

        mpi = "mpi"

        gloo = "gloo"





    @attr.s(auto_attribs=True)

    class AMPConfig:

        """PyTorch AMP configuration class



        Attributes

        ----------

        backoff_factor : float, default: 0.5

            Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration

        growth_factor : float, default: 2.0

            Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.

        growth_interval : int, default: 2000

            Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by

            growth_factor

        init_scale : float, default: 2.**16

            Initial scale factor



        """



        backoff_factor: float = 0.5

        growth_factor: float = 2.0

        growth_interval: int = 2000

        init_scale: float = 2.0 ** 16





    @attr.s(auto_attribs=True)

    class ApexConfig:

        """Nvidia APEX configuration class



        Attributes

        ----------

        cast_model_outputs: Optional[torch.dtype], default: None

            Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls

            https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm

        max_loss_scale: float, default: 2.**24

            Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling

        min_loss_scale: Optional[float], default: None

            Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None

            means that no floor is imposed

        scaler_per_loss: bool, default: False

            Option to impose a scaler for each loss instead of a global scaler

        verbosity: int, default: 0

            Set to 0 to suppress Amp-related output



        """



        cast_model_outputs: Optional[torch.dtype] = None

        convert_to_sync_batch_norm: bool = False

        max_loss_scale: float = 2.0 ** 24

        min_loss_scale: Optional[float] = None

        scaler_per_loss: bool = False

        verbosity: int = 0





    @attr.s(auto_attribs=True)

    class ClipGradConfig:

        """Gradient clipping by value configuration class



        Attributes

        ----------

        clip_value: float

            maximum allowed absolute value of the gradients [-clip_value, clip_value]



        """



        clip_value: float





    @attr.s(auto_attribs=True)

    class ClipGradNormConfig:

        """Gradient clipping by p-norm configuration class



        Attributes

        ----------

        max_norm: float

            max norm of the gradients

        norm_type: float

            type of the used p-norm



        """



        max_norm: float

        norm_type: float





    @attr.s(auto_attribs=True)

    class DDPConfig:

        """PyTorch DistributedDataParallel configuration class



        Attributes

        ----------

        local_rank: Optional[int]

            Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg)

        auto_mpi_discovery: bool, default: False

            if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed

            function call)

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls

            https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html

        backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        broadcast_buffers: bool, default: True

            Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function

        bucket_cap_mb: int, default: 25

            DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket

            can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)

        find_unused_parameters: bool, default: False

            Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward

            function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready

            to be reduced. Note that all forward outputs that are derived from module parameters must participate in

            calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd

            to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused

            can be detached from the autograd graph using torch.Tensor.detach

        gradient_as_bucket_view: bool, default: False

            When set to True, gradients will be views pointing to different offsets of allreduce communication

            buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients

            size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When

            gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by

            referring to the zero_grad() function in torch/optim/optimizer.py as a solution.

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        no_sync: bool, default: True

            for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on

            module variables, which will later be synchronized in the first forward-backward pass after exiting the

            context. no_sync might lead to higher memory usage but lower communication overhead



        """



        local_rank: Optional[int]

        auto_mpi_discovery: bool = False

        convert_to_sync_batch_norm: bool = False

        backend: BackendOptions = "nccl"

        broadcast_buffers: bool = True

        bucket_cap_mb: int = 25

        find_unused_parameters: bool = False

        gradient_as_bucket_view: bool = False

        init_method: str = "env://"

        no_sync: bool = True





    @attr.s(auto_attribs=True)

    class DeepspeedAIOConfig:

        """Deepspeed asynchronous I/O configuration class



        Attributes

        ----------

        block_size: int, default: 1048576

            I/O block size in bytes

        ignore_unused_parameters: bool, default: True

            Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.

            This controls whether or not training should terminate with an error message when unused parameters are

            detected.

        overlap_events: bool, default: True

            Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.

        queue_depth: int, default: 8

            I/O queue depth

        single_submit: bool, default: False

            Submit requests to storage device as multiple individual requests as opposed to one block of requests.

        thread_count: int, default: 1

            Intra-request parallelism for each read/write submitted by a user thread.



        """



        block_size: int = 1048576

        ignore_unused_parameters: bool = True

        overlap_events: bool = True

        queue_depth: int = 8

        single_submit: bool = False

        thread_count: int = 1





    @attr.s(auto_attribs=True)

    class DeepspeedActivationCheckpointingConfig:

        """Deepspeed activation checkpointing configuration class



        Attributes

        ----------

        contiguous_memory_optimization: bool, default: False

            Copies partitioned activations so that they are contiguous in memory

        cpu_checkpointing: bool, default: False

            Offloads partitioned activations to CPU if partition_activations is enabled

        number_checkpoints: Optional[int], default: None

            Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization

        partition_activations: bool, default: False

            Enables partition activation when used with model parallelism

        profile: bool, default: False

            Logs the forward and backward time for each checkpoint function

        synchronize_checkpoint_boundary: bool, default: False

            Inserts torch.cuda.synchronize() at each checkpoint boundary



        """



        contiguous_memory_optimization: bool = False

        cpu_checkpointing: bool = False

        number_checkpoints: Optional[int] = None

        partition_activations: bool = False

        profile: bool = False

        synchronize_checkpoint_boundary: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedFlopsConfig:

        """Deepspeed flops profiler configuration class



        Attributes

        ----------

        detailed: bool, default: True

            Whether to print the detailed model profile

        module_depth: int, default: -1

            The depth of the model at which to print the aggregated module information. When set to -1, it prints

            information from the top module to the innermost modules (the maximum depth).

        output_file: Optional[str], default: None

            Path to the output file. If None, the profiler prints to stdout

        profile_step: int, default: 1

            The global training step at which to profile.

        top_modules: int, default: 1

            Limits the aggregated profile output to the number of top modules specified.



        Notes

        -----

        Warm up steps are needed for accurate time measurement



        """



        detailed: bool = True

        module_depth: int = -1

        output_file: Optional[str] = None

        profile_step: int = 1

        top_modules: int = 1





    @attr.s(auto_attribs=True)

    class DeepspeedFP16Config:

        """Deepspeed FP16 configuration class



        Attributes

        ----------

        hysteresis: int, default: 2

            represents the delay shift in dynamic loss scaling

        initial_scale_power: int, default: 32

            power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power

        loss_scale: float, default: 0.0

            loss scaling value for FP16 training (0.0 --> dynamic scaling)

        loss_scale_window: int, default: 1000

            the window over which to raise/lower the dynamic loss scale value

        min_loss_scale: int, default: 1000

            minimum dynamic loss scale value



        """



        hysteresis: int = 2

        initial_scale_power: int = 32

        loss_scale: float = 0.0

        loss_scale_window: int = 1000

        min_loss_scale: int = 1000





    @attr.s(auto_attribs=True)

    class DeepspeedOffloadOptimizerConfig:

        """Deepspeed optimizer offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 4

            Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number

            of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,

            gradient, momentum, and variance).

        device: OffloadDevice, default: 'cpu'

            Device memory to offload optimizer state

        fast_init: bool, default: False

            Enable fast optimizer initialization when offloading to NVMe

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for optimizer state offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.

        pipeline: bool, default: False

            pipeline activated (will default to True if either pipeline_read or pipeline_write is set)

        pipeline_read: bool, default: False

            activate pipeline read (deepspeed has limited docs for what this does)

        pipeline_write: bool, default: False

            activate pipeline write (deepspeed has limited docs for what this does)



        """



        buffer_count: int = 4

        device: OffloadDevice = "cpu"

        fast_init: bool = False

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False

        pipeline: bool = False

        pipeline_read: bool = False

        pipeline_write: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedOffloadParamConfig:

        """Deepspeed parameter offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 5

            Number of buffers in buffer pool for parameter offloading to NVMe

        buffer_size: int, default: int(1E8)

            Size of buffers in buffer pool for parameter offloading to NVMe

        device: OffloadDevice, default: 'cpu'

            Device memory to offload model parameters

        max_in_cpu: int, default: int(1E9)

            Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for parameter offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.



        """



        buffer_count: int = 5

        buffer_size: int = int(1e8)

        device: OffloadDevice = "cpu"

        max_in_cpu: int = int(1e9)

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False





    @attr.s(auto_attribs=True)

    class DeepspeedPLDConfig:

        """

        Attributes

        ----------

        theta: float, default: 1.0

            Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,

            the faster the training speed

        gamma: float, default: 0.001

            Hyper-parameter that controls how fast the drop ratio increases



        """



        theta: float = 1.0

        gamma: float = 0.001





    @attr.s(auto_attribs=True)

    class DeepspeedTensorboardConfig:

        """Deepspeed Tensorboard configuration class



        Attributes

        ----------

        output_path: str, default: ''

            Tensorboard output path

        job_name: str, default: 'DeepSpeedJobName'

            Tensorboard job name



        """



        output_path: str = ""

        job_name: str = "DeepSpeedJobName"





    @attr.s(auto_attribs=True)

    class DeepspeedZeROConfig:

        """Deepspeed ZeRO configuration class



        Attributes

        ----------

        allgather_bucket_size: int, default: int(5E8)

            Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes

        allgather_partitions: bool, default: True

            Chooses between allgather collective or a series of broadcast collectives to gather updated parameters

            from all the GPUs at the end of each step

        contiguous_gradients: bool, default: False

            Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward

            pass. Only useful when running very large models.

        ignore_unused_parameters: bool, default: True

            Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload

            Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707

        legacy_stage1: bool, default: False

            Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None

            Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU

            memory for larger models or batch sizes. Valid only with stage 3

        offload_param: Optional[DeepspeedOffloadParamConfig], default: None

            Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch

            sizes. Valid only with stage 3.

        overlap_comm: bool, default: False

            Attempts to overlap the reduction of the gradients with backward computation

        reduce_bucket_size: int, default: int(5E8)

            Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large

            model sizes

        reduce_scatter: bool, default: True

            Uses reduce or reduce scatter instead of allreduce to average gradients

        stage: int, default: 0

            Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer to disabled, optimizer state

            partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,

            respectively

        stage3_max_live_parameters: int, default: int(1E9)

            The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but

            perform more communication.

        stage3_max_reuse_distance: int, default: int(1E9)

            Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less

            memory, but perform more communication.

        stage3_prefetch_bucket_size: int, default: int(5E8)

            The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase

            stalls due to communication.

        stage3_param_persistence_threshold: int, default: int(1E6)

            Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly

            increase communication (especially latency-bound messages).

        stage3_gather_fp16_weights_on_model_save: bool, default: False

            Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned

            across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this

            option is enabled and then saves the fp16 model weights.

        sub_group_size: int, default: int(1E12)

            sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are

            grouped into buckets of sub_group_size and each bucket is updated one at a time.



        """



        allgather_bucket_size: int = int(5e8)

        allgather_partitions: bool = True

        contiguous_gradients: bool = False

        ignore_unused_parameters: bool = True

        legacy_stage1: bool = False

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None

        offload_param: Optional[DeepspeedOffloadParamConfig] = None

        overlap_comm: bool = False

        reduce_bucket_size: int = int(5e8)

        reduce_scatter: bool = True

        stage: int = 0

        stage3_max_live_parameters: int = int(1e9)

        stage3_max_reuse_distance: int = int(1e9)

        stage3_prefetch_bucket_size: int = int(5e8)

        stage3_param_persistence_threshold: int = int(1e6)

        stage3_gather_fp16_weights_on_model_save: bool = False

        sub_group_size: int = int(1e12)





    @attr.s(auto_attribs=True)

    class DeepspeedConfig:

        """Deepspeed configuration class



        Composed of other configuration classes related to specific functionality



        Attributes

        ----------

        activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()

            Enables and configures activation checkpointing

        aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()

            Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent

            (NVMe) storage

        auto_mpi_discovery: bool, default: True

            if distributed environment variables are not set, attempt to discover them from MPI

        disable_allgather: bool, default: False

            Disables allgather

        dist_backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        distributed_port: int, default: 29500

            torch distributed backend port

        dump_state: bool, default: False

            Print out state information of DeepSpeed object after initialization

        flops_profiler: Optional[DeepspeedFlopsConfig], default: None

            Enables and configures the flops profiler. This would also enable wall_clock_breakdown

        fp16: Optional[DeepspeedFP16Config], default: None

            Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package

        fp32_allreduce: bool, default: False

            During gradient averaging perform allreduce with 32 bit values

        gradient_predivide_factor: float, default: 1.0

            Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability

            when scaling to large numbers of GPUs

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        prescale_gradients: bool, default: False

            Scale gradients before doing allreduce

        progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None

            Enables and configures progressive layer dropping

        sparse_gradients: bool, default: False

            Enable sparse compression of torch.nn.Embedding gradients

        steps_per_print: int, default: 10

            Print train loss every N steps

        tensorboard: Optional[DeepspeedTensorboardConfig], default: None

            Enables and configures tensorboard support

        verbose: bool, default: True

            flag to make deepspeed engine verbose with information

        wall_clock_breakdown: bool, default: False

            Enable timing of the latency of forward/backward/update training phases

        zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()

            Enables and configures ZeRO memory optimizations



        Notes

        -----

        Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16

        here is similar to AMP’s O2 mode



        """



        activation_checkpointing: Optional[

            DeepspeedActivationCheckpointingConfig

        ] = DeepspeedActivationCheckpointingConfig()

        aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()

        auto_mpi_discovery: bool = True

        disable_allgather: bool = False

        dist_backend: BackendOptions = "nccl"

        distributed_port: int = 29500

        dump_state: bool = False

        flops_profiler: Optional[DeepspeedFlopsConfig] = None

        fp16: Optional[DeepspeedFP16Config] = None

        fp32_allreduce: bool = False

        gradient_predivide_factor: float = 1.0

        init_method: str = "env://"

        prescale_gradients: bool = False

        progressive_layer_drop: Optional[DeepspeedPLDConfig] = None

        sparse_gradients: bool = False

        steps_per_print: int = 10

        tensorboard: Optional[DeepspeedTensorboardConfig] = None

        verbose: bool = True

        wall_clock_breakdown: bool = False

        zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()





    @attr.s(auto_attribs=True)

    class FairscaleOSSConfig:

        """Fairscale optimizer state sharding configuration class



        Attributes

        ----------

        broadcast_fp16: bool, default: False

            Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP

            is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.



        """



        broadcast_fp16: bool = False





    @attr.s(auto_attribs=True)

    class FairscaleSDDPConfig:

        """Fairscale sharded data parallel (SDDP) configuration class



        Attributes

        ----------

        auto_refresh_trainable: bool, default: True

            Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS

            automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a

            parameter is frozen or unfrozen

        broadcast_buffers: bool, default: True

            Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same

            setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters.

        reduce_buffer_size: int, default: 2 ** 23

            The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact

            the long term memory consumption, because these buckets correspond to parameters which will not be sharded.

            Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.

        reduce_fp16: bool, default: False

            cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve

            performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and

            will also save some memory.

        sync_models_at_startup: bool, default: True

            Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or

            the training restarts from a saved state



        """



        auto_refresh_trainable: bool = True

        broadcast_buffers: bool = True

        reduce_buffer_size: int = 2 ** 23

        reduce_fp16: bool = False

        sync_models_at_startup: bool = True





    @attr.s(auto_attribs=True)

    class FairscaleFSDPConfig:

        """Fairscale Fully Sharded Data Parallel configuration class



        Attributes

        ----------

        bucket_cap_mb: int, default: 25

            FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.

            bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the

            max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple

            bucket_cap_mb sized buffers) shared by all FSDP instances. Large gradient tensors are directly reduced without

            using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with

            computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory

            overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during

            the backward pass and freed at the end of the backward pass to save more memory for other phases of the

            training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP

            engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The

            order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the

            backward. In FSDP, the buffer size does not change with model size (it changes based on number of

            <dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush

            call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with

            compute is done differently too. Values <= 0 disable bucketing

        buffer_dtype: Optional[torch.dtype], default: None

            dtype for buffers for computation. defaults to value of compute_dtype

        clear_autocast_cache: bool, default: False

            When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast

            maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this

            flag to true will help clearing this cache as inner FSDP instances finish part of the forward pass to save

            GPU memory

        compute_dtype: Optional[torch.dtype], default: None

            dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,

            in which case it defaults to torch.float16.

        flatten_parameters: bool, default: True

            flatten parameters into a single contiguous tensor, which improves training speed

        force_input_to_fp32: bool, default: False

            force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision

            mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.

        fp32_reduce_scatter: bool, default: False

            reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used

        gradient_predivide_factor: Optional[float], default: None

            divide factor before the reduction

        gradient_postdivide_factor: Optional[float], default: None

            divide factor after the reduction

        move_grads_to_cpu: Optional[bool], default: None

            move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used

        move_params_to_cpu: bool, default: False

            offload FP32 params to CPU. This is only relevant when FP16 AMP is used

        no_broadcast_optim_state: Optional[bool], default: False

            do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this

            true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the

            proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few

            parameters can fit on one node

        reshard_after_forward: bool, default: True

            reshard parameters after the forward pass. This saves memory but slows training. This is only relevant

            when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)

        verbose: bool, default: False

            turn on verbose output for model’s string representation



        Notes

        -----

        mixed_precision: bool

            This value will automatically be set from the Stoke FP16 selected option (AMP only)

        state_dict_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup

        compute_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup



        """



        bucket_cap_mb: int = 25

        buffer_dtype: Optional[torch.dtype] = None

        clear_autocast_cache: bool = False

        compute_dtype: Optional[torch.dtype] = None

        flatten_parameters: bool = True

        force_input_to_fp32: bool = False

        fp32_reduce_scatter: bool = False

        gradient_predivide_factor: Optional[float] = None

        gradient_postdivide_factor: Optional[float] = None

        move_grads_to_cpu: Optional[bool] = None

        move_params_to_cpu: bool = False

        no_broadcast_optim_state: Optional[bool] = False

        reshard_after_forward: bool = True

        verbose: bool = False





    @attr.s(auto_attribs=True)

    class HorovodConfig:

        """Horovod configuration class



        Attributes

        ----------

        compression: bool, default: False

            Compression algorithm used during allreduce to reduce the amount of data sent during each parameter

            update step.

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls

            https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm

        gradient_predivide_factor: float, default: 1.0

            If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled

            by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.

        op: HorovodOps, default: 'Average'

            The reduction operation to use when combining gradients across different ranks.



        """



        compression: bool = False

        convert_to_sync_batch_norm: bool = False

        gradient_predivide_factor: float = 1.0

        op: HorovodOps = "Average"





    class StokeOptimizer(TypedDict):

        """Stoke optimizer wrapper class



    Given all the different backends and extensions, the optimizer might need to be instantiated in a different way;

    thus this typed dict holds the configuration without instantiation



        Attributes

        ----------

        optimizer: Type[torch.optim.Optimizer]

            un-instantiated torch.optim.Optimizer class

        optimizer_kwargs: Dict

            any keyword args to be unrolled into the optimizer at instantiation time



        """



        optimizer: Type[torch.optim.Optimizer]

        optimizer_kwargs: Dict
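
A minimal sketch of how these objects are typically constructed, assuming `stoke` is installed and that the names below are importable from `stoke.configs` (how they are handed to the top-level Stoke wrapper is documented elsewhere):

    import torch
    from stoke.configs import AMPConfig, ClipGradNormConfig, StokeOptimizer

    # StokeOptimizer is a TypedDict: the optimizer class is passed un-instantiated
    # and the kwargs are unrolled into it later at instantiation time
    opt = StokeOptimizer(
        optimizer=torch.optim.Adam,
        optimizer_kwargs={"lr": 1e-3},
    )

    # The config objects are plain attrs classes built from keyword arguments
    amp_config = AMPConfig(init_scale=2.0 ** 14)
    clip_config = ClipGradNormConfig(max_norm=5.0, norm_type=2.0)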

Classes

AMPConfig

class AMPConfig(
    backoff_factor: float = 0.5,
    growth_factor: float = 2.0,
    growth_interval: int = 2000,
    init_scale: float = 65536.0
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| backoff_factor | float, default: 0.5 | Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration | None |
| growth_factor | float, default: 2.0 | Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations. | None |
| growth_interval | int, default: 2000 | Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by growth_factor | None |
| init_scale | float, default: 2.**16 | Initial scale factor | None |

??? example "View Source" class AMPConfig:

        """PyTorch AMP configuration class



        Attributes

        ----------

        backoff_factor : float, default: 0.5

            Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration

        growth_factor : float, default: 2.0

            Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.

        growth_interval : int, default: 2000

            Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by

            growth_factor

        init_scale : float, default: 2.**16

            Initial scale factor



        """



        backoff_factor: float = 0.5

        growth_factor: float = 2.0

        growth_interval: int = 2000

        init_scale: float = 2.0 ** 16
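
A brief sketch (values illustrative, not recommendations) of overriding the native PyTorch AMP grad-scaler settings through this config:

    from stoke.configs import AMPConfig

    # Start the dynamic loss scale lower and attempt to grow it more often
    amp_config = AMPConfig(
        init_scale=2.0 ** 14,
        growth_interval=1000,
    )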

ApexConfig

class ApexConfig(
    cast_model_outputs: Union[torch.dtype, NoneType] = None,
    convert_to_sync_batch_norm: bool = False,
    max_loss_scale: float = 16777216.0,
    min_loss_scale: Union[float, NoneType] = None,
    scaler_per_loss: bool = False,
    verbosity: int = 0
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| cast_model_outputs | Optional[torch.dtype], default: None | Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm | None |
| max_loss_scale | float, default: 2.**24 | Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling | None |
| min_loss_scale | Optional[float], default: None | Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed | None |
| scaler_per_loss | bool, default: False | Option to impose a scaler for each loss instead of a global scaler | None |
| verbosity | int, default: 0 | Set to 0 to suppress Amp-related output | None |

??? example "View Source" class ApexConfig:

        """Nvidia APEX configuration class



        Attributes

        ----------

        cast_model_outputs: Optional[torch.dtype], default: None

            Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls

            https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm

        max_loss_scale: float, default: 2.**24

            Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling

        min_loss_scale: Optional[float], default: None

            Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None

            means that no floor is imposed

        scaler_per_loss: bool, default: False

            Option to impose a scaler for each loss instead of a global scaler

        verbosity: int, default: 0

            Set to 0 to suppress Amp-related output



        """



        cast_model_outputs: Optional[torch.dtype] = None

        convert_to_sync_batch_norm: bool = False

        max_loss_scale: float = 2.0 ** 24

        min_loss_scale: Optional[float] = None

        scaler_per_loss: bool = False

        verbosity: int = 0
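
A hedged sketch of a typical override, using only the attributes documented above (values illustrative):

    import torch
    from stoke.configs import ApexConfig

    # Always cast model outputs to FP32 and swap batch-norm layers for
    # apex.parallel.SyncBatchNorm
    apex_config = ApexConfig(
        cast_model_outputs=torch.float32,
        convert_to_sync_batch_norm=True,
    )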

BackendOptions

class BackendOptions(
    /,
    *args,
    **kwargs
)

??? example "View Source" class BackendOptions(Enum):

        """Communication backend options"""



        nccl = "nccl"

        mpi = "mpi"

        gloo = "gloo"

Ancestors (in MRO)

  • enum.Enum

Class variables

gloo
mpi
name
nccl
value

ClipGradConfig

class ClipGradConfig(
    clip_value: float
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| clip_value | float | maximum allowed absolute value of the gradients [-clip_value, clip_value] | None |

??? example "View Source" class ClipGradConfig:

        """Gradient clipping by value configuration class



        Attributes

        ----------

        clip_value: float

            maximum allowed absolute value of the gradients [-clip_value, clip_value]



        """



        clip_value: float

ClipGradNormConfig

class ClipGradNormConfig(
    max_norm: float,
    norm_type: float
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| max_norm | float | max norm of the gradients | None |
| norm_type | float | type of the used p-norm | None |

??? example "View Source" class ClipGradNormConfig:

        """Gradient clipping by p-norm configuration class



        Attributes

        ----------

        max_norm: float

            max norm of the gradients

        norm_type: float

            type of the used p-norm



        """



        max_norm: float

        norm_type: float
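
A short sketch showing both gradient-clipping configs side by side (thresholds illustrative):

    from stoke.configs import ClipGradConfig, ClipGradNormConfig

    # Clip each gradient element to [-1.0, 1.0] ...
    clip_by_value = ClipGradConfig(clip_value=1.0)

    # ... or clip the global gradient L2 norm to 5.0
    clip_by_norm = ClipGradNormConfig(max_norm=5.0, norm_type=2.0)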

DDPConfig

class DDPConfig(
    local_rank: Union[int, NoneType],
    auto_mpi_discovery: bool = False,
    convert_to_sync_batch_norm: bool = False,
    backend: stoke.configs.BackendOptions = 'nccl',
    broadcast_buffers: bool = True,
    bucket_cap_mb: int = 25,
    find_unused_parameters: bool = False,
    gradient_as_bucket_view: bool = False,
    init_method: str = 'env://',
    no_sync: bool = True
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| local_rank | Optional[int] | Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg) | None |
| auto_mpi_discovery | bool, default: False | if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed function call) | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html | None |
| backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
| broadcast_buffers | bool, default: True | Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function | None |
| bucket_cap_mb | int, default: 25 | DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB) | None |
| find_unused_parameters | bool, default: False | Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready to be reduced. Note that all forward outputs that are derived from module parameters must participate in calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused can be detached from the autograd graph using torch.Tensor.detach | None |
| gradient_as_bucket_view | bool, default: False | When set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by referring to the zero_grad() function in torch/optim/optimizer.py as a solution. | None |
| init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
| no_sync | bool, default: True | for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on module variables, which will later be synchronized in the first forward-backward pass after exiting the context. no_sync might lead to higher memory usage but lower communication overhead | None |

??? example "View Source" class DDPConfig:

        """PyTorch DistributedDataParallel configuration class



        Attributes

        ----------

        local_rank: Optional[int]

            Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_arg)

        auto_mpi_discovery: bool, default: False

            if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed

            function call)

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls

            https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html

        backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        broadcast_buffers: bool, default: True

            Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function

        bucket_cap_mb: int, default: 25

            DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket

            can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)

        find_unused_parameters: bool, default: False

            Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward

            function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready

            to be reduced. Note that all forward outputs that are derived from module parameters must participate in

            calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd

            to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused

            can be detached from the autograd graph using torch.Tensor.detach

        gradient_as_bucket_view: bool, default: False

            When set to True, gradients will be views pointing to different offsets of allreduce communication

            buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients

            size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When

            gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by

            referring to the zero_grad() function in torch/optim/optimizer.py as a solution.

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        no_sync: bool, default: True

            for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on

            module variables, which will later be synchronized in the first forward-backward pass after exiting the

            context. no_sync might lead to higher memory usage but lower communication overhead



        """



        local_rank: Optional[int]

        auto_mpi_discovery: bool = False

        convert_to_sync_batch_norm: bool = False

        backend: BackendOptions = "nccl"

        broadcast_buffers: bool = True

        bucket_cap_mb: int = 25

        find_unused_parameters: bool = False

        gradient_as_bucket_view: bool = False

        init_method: str = "env://"

        no_sync: bool = True
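
A hedged sketch of constructing this config; local_rank has no default and is commonly injected by the launcher via the LOCAL_RANK environment variable (other values illustrative):

    import os
    from stoke.configs import DDPConfig

    ddp_config = DDPConfig(
        local_rank=int(os.environ.get("LOCAL_RANK", 0)),
        convert_to_sync_batch_norm=True,  # swap BN layers for torch.nn.SyncBatchNorm
        bucket_cap_mb=50,                 # larger gradient-reduction buckets
    )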

DeepspeedAIOConfig

class DeepspeedAIOConfig(
    block_size: int = 1048576,
    ignore_unused_parameters: bool = True,
    overlap_events: bool = True,
    queue_depth: int = 8,
    single_submit: bool = False,
    thread_count: int = 1
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| block_size | int, default: 1048576 | I/O block size in bytes | None |
| ignore_unused_parameters | bool, default: True | Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. | None |
| overlap_events | bool, default: True | Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests. | None |
| queue_depth | int, default: 8 | I/O queue depth | None |
| single_submit | bool, default: False | Submit requests to storage device as multiple individual requests as opposed to one block of requests. | None |
| thread_count | int, default: 1 | Intra-request parallelism for each read/write submitted by a user thread. | None |

??? example "View Source" class DeepspeedAIOConfig:

        """Deepspeed asynchronous I/O configuration class



        Attributes

        ----------

        block_size: int, default: 1048576

            I/O block size in bytes

        ignore_unused_parameters: bool, default: True

            Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.

            This controls whether or not training should terminate with an error message when unused parameters are

            detected.

        overlap_events: bool, default: True

            Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.

        queue_depth: int, default: 8

            I/O queue depth

        single_submit: bool, default: False

            Submit requests to storage device as multiple individual requests as opposed to one block of requests.

        thread_count: int, default: 1

            Intra-request parallelism for each read/write submitted by a user thread.



        """



        block_size: int = 1048576

        ignore_unused_parameters: bool = True

        overlap_events: bool = True

        queue_depth: int = 8

        single_submit: bool = False

        thread_count: int = 1
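
A small sketch of tuning the async I/O engine used for NVMe offload (values illustrative):

    from stoke.configs import DeepspeedAIOConfig

    aio_config = DeepspeedAIOConfig(
        queue_depth=16,   # deeper I/O queue
        thread_count=2,   # more intra-request parallelism per user thread
    )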

DeepspeedActivationCheckpointingConfig

class DeepspeedActivationCheckpointingConfig(
    contiguous_memory_optimization: bool = False,
    cpu_checkpointing: bool = False,
    number_checkpoints: Union[int, NoneType] = None,
    partition_activations: bool = False,
    profile: bool = False,
    synchronize_checkpoint_boundary: bool = False
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| contiguous_memory_optimization | bool, default: False | Copies partitioned activations so that they are contiguous in memory | None |
| cpu_checkpointing | bool, default: False | Offloads partitioned activations to CPU if partition_activations is enabled | None |
| number_checkpoints | Optional[int], default: None | Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | None |
| partition_activations | bool, default: False | Enables partition activation when used with model parallelism | None |
| profile | bool, default: False | Logs the forward and backward time for each checkpoint function | None |
| synchronize_checkpoint_boundary | bool, default: False | Inserts torch.cuda.synchronize() at each checkpoint boundary | None |

??? example "View Source" class DeepspeedActivationCheckpointingConfig:

        """Deepspeed activation checkpointing configuration class



        Attributes

        ----------

        contiguous_memory_optimization: bool, default: False

            Copies partitioned activations so that they are contiguous in memory

        cpu_checkpointing: bool, default: False

            Offloads partitioned activations to CPU if partition_activations is enabled

        number_checkpoints: Optional[int], default: None

            Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization

        partition_activations: bool, default: False

            Enables partition activation when used with model parallelism

        profile: bool, default: False

            Logs the forward and backward time for each checkpoint function

        synchronize_checkpoint_boundary: bool, default: False

            Inserts torch.cuda.synchronize() at each checkpoint boundary



        """



        contiguous_memory_optimization: bool = False

        cpu_checkpointing: bool = False

        number_checkpoints: Optional[int] = None

        partition_activations: bool = False

        profile: bool = False

        synchronize_checkpoint_boundary: bool = False
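
A hedged sketch of enabling partitioned activations with CPU offload, per the attributes above (flags illustrative):

    from stoke.configs import DeepspeedActivationCheckpointingConfig

    ckpt_config = DeepspeedActivationCheckpointingConfig(
        partition_activations=True,  # partition activations under model parallelism
        cpu_checkpointing=True,      # offload the partitioned activations to CPU
    )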

DeepspeedConfig

class DeepspeedConfig(
    activation_checkpointing: Union[stoke.configs.DeepspeedActivationCheckpointingConfig, NoneType] = DeepspeedActivationCheckpointingConfig(contiguous_memory_optimization=False, cpu_checkpointing=False, number_checkpoints=None, partition_activations=False, profile=False, synchronize_checkpoint_boundary=False),
    aio: Union[stoke.configs.DeepspeedAIOConfig, NoneType] = DeepspeedAIOConfig(block_size=1048576, ignore_unused_parameters=True, overlap_events=True, queue_depth=8, single_submit=False, thread_count=1),
    auto_mpi_discovery: bool = True,
    disable_allgather: bool = False,
    dist_backend: stoke.configs.BackendOptions = 'nccl',
    distributed_port: int = 29500,
    dump_state: bool = False,
    flops_profiler: Union[stoke.configs.DeepspeedFlopsConfig, NoneType] = None,
    fp16: Union[stoke.configs.DeepspeedFP16Config, NoneType] = None,
    fp32_allreduce: bool = False,
    gradient_predivide_factor: float = 1.0,
    init_method: str = 'env://',
    prescale_gradients: bool = False,
    progressive_layer_drop: Union[stoke.configs.DeepspeedPLDConfig, NoneType] = None,
    sparse_gradients: bool = False,
    steps_per_print: int = 10,
    tensorboard: Union[stoke.configs.DeepspeedTensorboardConfig, NoneType] = None,
    verbose: bool = True,
    wall_clock_breakdown: bool = False,
    zero_optimization: Union[stoke.configs.DeepspeedZeROConfig, NoneType] = DeepspeedZeROConfig(allgather_bucket_size=500000000, allgather_partitions=True, contiguous_gradients=False, ignore_unused_parameters=True, legacy_stage1=False, offload_optimizer=None, offload_param=None, overlap_comm=False, reduce_bucket_size=500000000, reduce_scatter=True, stage=0, stage3_max_live_parameters=1000000000, stage3_max_reuse_distance=1000000000, stage3_prefetch_bucket_size=500000000, stage3_param_persistence_threshold=1000000, stage3_gather_fp16_weights_on_model_save=False, sub_group_size=1000000000000)
)

Attributes

| Name | Type | Description | Default |
|------|------|-------------|---------|
| activation_checkpointing | Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig() | Enables and configures activation checkpointing | None |
| aio | Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig() | Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent (NVMe) storage | None |
| auto_mpi_discovery | bool, default: True | if distributed environment variables are not set, attempt to discover them from MPI | None |
| disable_allgather | bool, default: False | Disables allgather | None |
| dist_backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
| distributed_port | int, default: 29500 | torch distributed backend port | None |
| dump_state | bool, default: False | Print out state information of DeepSpeed object after initialization | None |
| flops_profiler | Optional[DeepspeedFlopsConfig], default: None | Enables and configures the flops profiler. This would also enable wall_clock_breakdown | None |
| fp16 | Optional[DeepspeedFP16Config], default: None | Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package | None |
| fp32_allreduce | bool, default: False | During gradient averaging perform allreduce with 32 bit values | None |
| gradient_predivide_factor | float, default: 1.0 | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs | None |
| init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
| prescale_gradients | bool, default: False | Scale gradients before doing allreduce | None |
| progressive_layer_drop | Optional[DeepspeedPLDConfig], default: None | Enables and configures progressive layer dropping | None |
| sparse_gradients | bool, default: False | Enable sparse compression of torch.nn.Embedding gradients | None |
| steps_per_print | int, default: 10 | Print train loss every N steps | None |
| tensorboard | Optional[DeepspeedTensorboardConfig], default: None | Enables and configures tensorboard support | None |
| verbose | bool, default: True | flag to make deepspeed engine verbose with information | None |
| wall_clock_breakdown | bool, default: False | Enable timing of the latency of forward/backward/update training phases | None |
| zero_optimization | Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig() | Enables and configures ZeRO memory optimizations | None |
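
A hedged sketch of composing the nested Deepspeed configs, here ZeRO stage 3 with CPU optimizer offload and dynamic-loss-scale FP16 (all values illustrative):

    from stoke.configs import (
        DeepspeedConfig,
        DeepspeedFP16Config,
        DeepspeedOffloadOptimizerConfig,
        DeepspeedZeROConfig,
    )

    deepspeed_config = DeepspeedConfig(
        fp16=DeepspeedFP16Config(),  # loss_scale=0.0 --> dynamic loss scaling
        zero_optimization=DeepspeedZeROConfig(
            stage=3,  # optimizer + gradient + parameter partitioning
            offload_optimizer=DeepspeedOffloadOptimizerConfig(device="cpu"),
        ),
    )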

??? example "View Source" class DeepspeedConfig:

        """Deepspeed configuration class



        Composed of other configuration classes related to specific functionality



        Attributes

        ----------

        activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()

            Enables and configures activation checkpointing

        aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()

            Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent

            (NVMe) storage

        auto_mpi_discovery: bool, default: True

            if distributed environment variables are not set, attempt to discover them from MPI

        disable_allgather: bool, default: False

            Disables allgather

        dist_backend: BackendOptions, default: 'nccl'

            Which communication backend to use

        distributed_port: int, default: 29500

            torch distributed backend port

        dump_state: bool, default: False

            Print out state information of DeepSpeed object after initialization

        flops_profiler: Optional[DeepspeedFlopsConfig], default: None

            Enables and configures the flops profiler. This would also enable wall_clock_breakdown

        fp16: Optional[DeepspeedFP16Config], default: None

            Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package

        fp32_allreduce: bool, default: False

            During gradient averaging perform allreduce with 32 bit values

        gradient_predivide_factor: float, default: 1.0

            Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability

            when scaling to large numbers of GPUs

        init_method: str, default: 'env://'

            URL specifying how to initialize the process group

        prescale_gradients: bool, default: False

            Scale gradients before doing allreduce

        progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None

            Enables and configures progressive layer dropping

        sparse_gradients: bool, default: False

            Enable sparse compression of torch.nn.Embedding gradients

        steps_per_print: int, default: 10

            Print train loss every N steps

        tensorboard: Optional[DeepspeedTensorboardConfig], default: None

            Enables and configures tensorboard support

        verbose: bool, default: True

            flag to make deepspeed engine verbose with information

        wall_clock_breakdown: bool, default: False

            Enable timing of the latency of forward/backward/update training phases

        zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()

            Enables and configures ZeRO memory optimizations



        Notes

        -----

        Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16

        here is similar to AMP’s O2 mode



        """



        activation_checkpointing: Optional[

            DeepspeedActivationCheckpointingConfig

        ] = DeepspeedActivationCheckpointingConfig()

        aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()

        auto_mpi_discovery: bool = True

        disable_allgather: bool = False

        dist_backend: BackendOptions = "nccl"

        distributed_port: int = 29500

        dump_state: bool = False

        flops_profiler: Optional[DeepspeedFlopsConfig] = None

        fp16: Optional[DeepspeedFP16Config] = None

        fp32_allreduce: bool = False

        gradient_predivide_factor: float = 1.0

        init_method: str = "env://"

        prescale_gradients: bool = False

        progressive_layer_drop: Optional[DeepspeedPLDConfig] = None

        sparse_gradients: bool = False

        steps_per_print: int = 10

        tensorboard: Optional[DeepspeedTensorboardConfig] = None

        verbose: bool = True

        wall_clock_breakdown: bool = False

        zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()
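
As an illustrative sketch (assuming stoke is installed with its DeepSpeed extras), the class is built by composing the sub-configs documented on this page; the chosen values below are examples only, and handing the resulting object to the Stoke wrapper is outside the scope of this page:

```python
from stoke.configs import DeepspeedConfig, DeepspeedFP16Config, DeepspeedZeROConfig

# Hypothetical configuration: FP16 with dynamic loss scaling plus ZeRO stage 2
deepspeed_config = DeepspeedConfig(
    dist_backend="nccl",                       # communication backend
    fp16=DeepspeedFP16Config(loss_scale=0.0),  # 0.0 --> dynamic loss scaling
    zero_optimization=DeepspeedZeROConfig(
        stage=2,                    # optimizer + gradient state partitioning
        overlap_comm=True,          # overlap gradient reduction with backward compute
        contiguous_gradients=True,  # avoid memory fragmentation for very large models
    ),
    steps_per_print=100,            # print train loss every 100 steps
)
```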

DeepspeedFP16Config

class DeepspeedFP16Config(
    hysteresis: int = 2,
    initial_scale_power: int = 32,
    loss_scale: float = 0.0,
    loss_scale_window: int = 1000,
    min_loss_scale: int = 1000
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| hysteresis | int, default: 2 | represents the delay shift in dynamic loss scaling | None |
| initial_scale_power | int, default: 32 | power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power | None |
| loss_scale | float, default: 0.0 | loss scaling value for FP16 training (0.0 --> dynamic scaling) | None |
| loss_scale_window | int, default: 1000 | the window over which to raise/lower the dynamic loss scale value | None |
| min_loss_scale | int, default: 1000 | minimum dynamic loss scale value | None |

??? example "View Source" class DeepspeedFP16Config:

        """Deepspeed FP16 configuration class



        Attributes

        ----------

        hysteresis: int, default: 2

            represents the delay shift in dynamic loss scaling

        initial_scale_power: int, default: 32

            power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power

        loss_scale: float, default: 0.0

            loss scaling value for FP16 training (0.0 --> dynamic scaling)

        loss_scale_window: int, default: 1000

            the window over which to raise/lower the dynamic loss scale value

        min_loss_scale: int, default: 1000

            minimum dynamic loss scale value



        """



        hysteresis: int = 2

        initial_scale_power: int = 32

        loss_scale: float = 0.0

        loss_scale_window: int = 1000

        min_loss_scale: int = 1000
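
A minimal sketch of the two loss-scaling modes (illustrative values only); under dynamic scaling the starting scale is 2 ** initial_scale_power:

```python
from stoke.configs import DeepspeedFP16Config

# Dynamic loss scaling: loss_scale stays at 0.0 and the scale starts at 2 ** 16
dynamic_fp16 = DeepspeedFP16Config(
    initial_scale_power=16,  # initial scale = 2 ** 16 = 65536
    loss_scale_window=500,   # window over which the dynamic scale is raised/lowered
    min_loss_scale=1,        # floor for the dynamic loss scale
)

# Static loss scaling: any non-zero loss_scale disables dynamic scaling
static_fp16 = DeepspeedFP16Config(loss_scale=128.0)
```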

DeepspeedFlopsConfig

class DeepspeedFlopsConfig(
    detailed: bool = True,
    module_depth: int = -1,
    output_file: Union[str, NoneType] = None,
    profile_step: int = 1,
    top_modules: int = 1
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| detailed | bool, default: True | Whether to print the detailed model profile | None |
| module_depth | int, default: -1 | The depth of the model at which to print the aggregated module information. When set to -1, it prints information from the top module to the innermost modules (the maximum depth). | None |
| output_file | Optional[str], default: None | Path to the output file. If None, the profiler prints to stdout | None |
| profile_step | int, default: 1 | The global training step at which to profile. | None |
| top_modules | int, default: 1 | Limits the aggregated profile output to the number of top modules specified. | None |

??? example "View Source" class DeepspeedFlopsConfig:

        """Deepspeed flops profiler configuration class



        Attributes

        ----------

        detailed: bool, default: True

            Whether to print the detailed model profile

        module_depth: int, default: -1

            The depth of the model at which to print the aggregated module information. When set to -1, it prints

            information from the top module to the innermost modules (the maximum depth).

        output_file: Optional[str], default: None

            Path to the output file. If None, the profiler prints to stdout

        profile_step: int, default: 1

            The global training step at which to profile.

        top_modules: int, default: 1

            Limits the aggregated profile output to the number of top modules specified.



        Notes

        -----

        Warm up steps are needed for accurate time measurement



        """



        detailed: bool = True

        module_depth: int = -1

        output_file: Optional[str] = None

        profile_step: int = 1

        top_modules: int = 1
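
An illustrative sketch; the output path is a placeholder, and the profiler is attached via the flops_profiler attribute of DeepspeedConfig, which also turns on wall_clock_breakdown:

```python
from stoke.configs import DeepspeedConfig, DeepspeedFlopsConfig

flops = DeepspeedFlopsConfig(
    profile_step=10,                 # profile at global training step 10 (after warm up)
    detailed=True,                   # include the per-module breakdown
    top_modules=3,                   # limit the aggregated output to the top 3 modules
    output_file="flops_report.txt",  # placeholder path; None prints to stdout instead
)

ds_config = DeepspeedConfig(flops_profiler=flops)
```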

DeepspeedOffloadOptimizerConfig

class DeepspeedOffloadOptimizerConfig(
    buffer_count: int = 4,
    device: stoke.configs.OffloadDevice = 'cpu',
    fast_init: bool = False,
    nvme_path: str = '/local_nvme',
    pin_memory: bool = False,
    pipeline: bool = False,
    pipeline_read: bool = False,
    pipeline_write: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| buffer_count | int, default: 4 | Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). | None |
| device | OffloadDevice, default: 'cpu' | Device memory to offload optimizer state | None |
| fast_init | bool, default: False | Enable fast optimizer initialization when offloading to NVMe | None |
| nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for optimizer state offloading | None |
| pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |
| pipeline | bool, default: False | pipeline activated (will default to True if either pipeline_read or pipeline_write is set to True) | None |
| pipeline_read | bool, default: False | activate pipeline read (deepspeed has limited docs for what this does) | None |
| pipeline_write | bool, default: False | activate pipeline write (deepspeed has limited docs for what this does) | None |

??? example "View Source" class DeepspeedOffloadOptimizerConfig:

        """Deepspeed optimizer offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 4

            Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number

            of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,

            gradient, momentum, and variance).

        device: OffloadDevice, default: 'cpu'

            Device memory to offload optimizer state

        fast_init: bool, default: False

            Enable fast optimizer initialization when offloading to NVMe

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for optimizer state offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.

        pipeline: bool, default: False

            pipeline activated (will default to True if either pipeline_read or pipeline_write is set to True)

        pipeline_read: bool, default: False

            activate pipeline read (deepspeed has limited docs for what this does)

        pipeline_write: bool, default: False

            activate pipeline write (deepspeed has limited docs for what this does)



        """



        buffer_count: int = 4

        device: OffloadDevice = "cpu"

        fast_init: bool = False

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False

        pipeline: bool = False

        pipeline_read: bool = False

        pipeline_write: bool = False
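
Two hedged examples of how this config might be filled in, one for CPU offload and one for NVMe offload; the counts are illustrative only:

```python
from stoke.configs import DeepspeedOffloadOptimizerConfig

# Offload optimizer state to page-locked CPU memory (extra memory for more throughput)
cpu_offload = DeepspeedOffloadOptimizerConfig(device="cpu", pin_memory=True)

# Offload optimizer state to NVMe; buffer_count should cover the optimizer's
# per-parameter states (e.g. 4 for Adam: parameter, gradient, momentum, variance)
nvme_offload = DeepspeedOffloadOptimizerConfig(
    device="nvme",
    nvme_path="/local_nvme",  # documented default filesystem path
    buffer_count=4,
    fast_init=True,           # fast optimizer initialization when offloading to NVMe
)
```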

DeepspeedOffloadParamConfig

class DeepspeedOffloadParamConfig(
    buffer_count: int = 5,
    buffer_size: int = 100000000,
    device: stoke.configs.OffloadDevice = 'cpu',
    max_in_cpu: int = 1000000000,
    nvme_path: str = '/local_nvme',
    pin_memory: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| buffer_count | int, default: 5 | Number of buffers in buffer pool for parameter offloading to NVMe | None |
| buffer_size | int, default: int(1E8) | Size of buffers in buffer pool for parameter offloading to NVMe | None |
| device | OffloadDevice, default: 'cpu' | Device memory to offload model parameters | None |
| max_in_cpu | int, default: int(1E9) | Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | None |
| nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for parameter offloading | None |
| pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |

??? example "View Source" class DeepspeedOffloadParamConfig:

        """Deepspeed parameter offloading configuration class



        Attributes

        ----------

        buffer_count: int, default: 5

            Number of buffers in buffer pool for parameter offloading to NVMe

        buffer_size: int, default: int(1E8)

            Size of buffers in buffer pool for parameter offloading to NVMe

        device: OffloadDevice, default: 'cpu'

            Device memory to offload model parameters

        max_in_cpu: int, default: int(1E9)

            Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

        nvme_path: str, default: '/local_nvme'

            Filesystem path for NVMe device for parameter offloading

        pin_memory: bool, default: False

            Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.



        """



        buffer_count: int = 5

        buffer_size: int = int(1e8)

        device: OffloadDevice = "cpu"

        max_in_cpu: int = int(1e9)

        nvme_path: str = "/local_nvme"

        pin_memory: bool = False
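
A sketch of parameter offloading to NVMe under the same assumptions as above; the numeric values simply restate the documented defaults:

```python
from stoke.configs import DeepspeedOffloadParamConfig

param_offload = DeepspeedOffloadParamConfig(
    device="nvme",
    nvme_path="/local_nvme",  # documented default filesystem path
    buffer_count=5,           # buffers in the NVMe offload buffer pool
    buffer_size=int(1e8),     # elements per buffer
    max_in_cpu=int(1e9),      # parameter elements kept in CPU memory
    pin_memory=True,          # page-locked CPU memory for higher throughput
)
```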

DeepspeedPLDConfig

class DeepspeedPLDConfig(
    theta: float = 1.0,
    gamma: float = 0.001
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| theta | float, default: 1.0 | Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value, the faster the training speed | None |
| gamma | float, default: 0.001 | Hyper-parameter that controls how fast the drop ratio increases | None |

??? example "View Source" class DeepspeedPLDConfig:

        """

        Attributes

        ----------

        theta: float, default: 1.0

            Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,

            the faster the training speed

        gamma: float, default: 0.001

            Hyper-parameter that controls how fast the drop ratio increases



        """



        theta: float = 1.0

        gamma: float = 0.001

DeepspeedTensorboardConfig

class DeepspeedTensorboardConfig(
    output_path: str = '',
    job_name: str = 'DeepSpeedJobName'
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| output_path | str, default: '' | Tensorboard output path | None |
| job_name | str, default: 'DeepSpeedJobName' | Tensorboard job name | None |

??? example "View Source" class DeepspeedTensorboardConfig:

        """Deepspeed Tensorboard configuration class



        Attributes

        ----------

        output_path: str, default: ''

            Tensorboard output path

        job_name: str, default: 'DeepSpeedJobName'

            Tensorboard job name



        """



        output_path: str = ""

        job_name: str = "DeepSpeedJobName"
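
Both of the small configs above slot directly into DeepspeedConfig; a brief hedged sketch with placeholder path and job name:

```python
from stoke.configs import DeepspeedConfig, DeepspeedPLDConfig, DeepspeedTensorboardConfig

ds_config = DeepspeedConfig(
    progressive_layer_drop=DeepspeedPLDConfig(theta=0.5, gamma=0.001),
    tensorboard=DeepspeedTensorboardConfig(
        output_path="./runs",        # placeholder output directory
        job_name="my-training-job",  # placeholder job name
    ),
)
```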

DeepspeedZeROConfig

class DeepspeedZeROConfig(
    allgather_bucket_size: int = 500000000,
    allgather_partitions: bool = True,
    contiguous_gradients: bool = False,
    ignore_unused_parameters: bool = True,
    legacy_stage1: bool = False,
    offload_optimizer: Union[stoke.configs.DeepspeedOffloadOptimizerConfig, NoneType] = None,
    offload_param: Union[stoke.configs.DeepspeedOffloadParamConfig, NoneType] = None,
    overlap_comm: bool = False,
    reduce_bucket_size: int = 500000000,
    reduce_scatter: bool = True,
    stage: int = 0,
    stage3_max_live_parameters: int = 1000000000,
    stage3_max_reuse_distance: int = 1000000000,
    stage3_prefetch_bucket_size: int = 500000000,
    stage3_param_persistence_threshold: int = 1000000,
    stage3_gather_fp16_weights_on_model_save: bool = False,
    sub_group_size: int = 1000000000000
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| allgather_bucket_size | int, default: int(5E8) | Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes | None |
| allgather_partitions | bool, default: True | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | None |
| contiguous_gradients | bool, default: False | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during the backward pass. Only useful when running very large models. | None |
| ignore_unused_parameters | bool, default: True | Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload. Enable this option to avoid https://github.com/microsoft/DeepSpeed/issues/707 | None |
| legacy_stage1 | bool, default: False | Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons | None |
| offload_optimizer | Optional[DeepspeedOffloadOptimizerConfig], default: None | Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3 | None |
| offload_param | Optional[DeepspeedOffloadParamConfig], default: None | Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | None |
| overlap_comm | bool, default: False | Attempts to overlap the reduction of the gradients with backward computation | None |
| reduce_bucket_size | int, default: int(5E8) | Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | None |
| reduce_scatter | bool, default: True | Uses reduce or reduce scatter instead of allreduce to average gradients | None |
| stage | int, default: 0 | Chooses different stages of the ZeRO optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively | None |
| stage3_max_live_parameters | int, default: int(1E9) | The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. | None |
| stage3_max_reuse_distance | int, default: int(1E9) | Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. | None |
| stage3_prefetch_bucket_size | int, default: int(5E8) | The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. | None |
| stage3_param_persistence_threshold | int, default: int(1E6) | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | None |
| stage3_gather_fp16_weights_on_model_save | bool, default: False | Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | None |
| sub_group_size | int, default: int(1E12) | Controls the granularity in which parameters are updated during optimizer steps. Parameters are grouped into buckets of sub_group_size and each bucket is updated one at a time. | None |

??? example "View Source" class DeepspeedZeROConfig:

        """Deepspeed ZeRO configuration class



        Attributes

        ----------

        allgather_bucket_size: int, default: int(5E8)

            Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes

        allgather_partitions: bool, default: True

            Chooses between allgather collective or a series of broadcast collectives to gather updated parameters

            from all the GPUs at the end of each step

        contiguous_gradients: bool, default: False

            Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward

            pass. Only useful when running very large models.

        ignore_unused_parameters: bool, default: True

            Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload

            Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707

        legacy_stage1: bool, default: False

            Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None

            Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU

            memory for larger models or batch sizes. Valid only with stage 3

        offload_param: Optional[DeepspeedOffloadParamConfig], default: None

            Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch

            sizes. Valid only with stage 3.

        overlap_comm: bool, default: False

            Attempts to overlap the reduction of the gradients with backward computation

        reduce_bucket_size: int, default: int(5E8)

            Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large

            model sizes

        reduce_scatter: bool, default: True

            Uses reduce or reduce scatter instead of allreduce to average gradients

        stage: int, default: 0

            Chooses different stages of the ZeRO optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state

            partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,

            respectively

        stage3_max_live_parameters: int, default: int(1E9)

            The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but

            perform more communication.

        stage3_max_reuse_distance: int, default: int(1E9)

            Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less

            memory, but perform more communication.

        stage3_prefetch_bucket_size: int, default: int(5E8)

            The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase

            stalls due to communication.

        stage3_param_persistence_threshold: int, default: int(1E6)

            Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly

            increase communication (especially latency-bound messages).

        stage3_gather_fp16_weights_on_model_save: bool, default: False

            Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned

            across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this

            option is enabled and then saves the fp16 model weights.

        sub_group_size: int, default: int(1E12)

            sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are

            grouped into buckets of sub_group_size and each buckets is updated one at a time.



        """



        allgather_bucket_size: int = int(5e8)

        allgather_partitions: bool = True

        contiguous_gradients: bool = False

        ignore_unused_parameters: bool = True

        legacy_stage1: bool = False

        offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None

        offload_param: Optional[DeepspeedOffloadParamConfig] = None

        overlap_comm: bool = False

        reduce_bucket_size: int = int(5e8)

        reduce_scatter: bool = True

        stage: int = 0

        stage3_max_live_parameters: int = int(1e9)

        stage3_max_reuse_distance: int = int(1e9)

        stage3_prefetch_bucket_size: int = int(5e8)

        stage3_param_persistence_threshold: int = int(1e6)

        stage3_gather_fp16_weights_on_model_save: bool = False

        sub_group_size: int = int(1e12)
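
A hedged sketch of a ZeRO stage 3 setup that combines the offload configs documented earlier (offload_optimizer and offload_param are only valid with stage 3); the values are illustrative:

```python
from stoke.configs import (
    DeepspeedOffloadOptimizerConfig,
    DeepspeedOffloadParamConfig,
    DeepspeedZeROConfig,
)

# Stage 3: optimizer + gradient + parameter partitioning, with CPU offload
zero3 = DeepspeedZeROConfig(
    stage=3,
    overlap_comm=True,           # overlap gradient reduction with backward compute
    contiguous_gradients=True,   # avoid memory fragmentation for very large models
    offload_optimizer=DeepspeedOffloadOptimizerConfig(device="cpu", pin_memory=True),
    offload_param=DeepspeedOffloadParamConfig(device="cpu", pin_memory=True),
    stage3_gather_fp16_weights_on_model_save=True,  # consolidate weights on save
)
```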

FairscaleFSDPConfig

class FairscaleFSDPConfig(
    bucket_cap_mb: int = 25,
    buffer_dtype: Union[torch.dtype, NoneType] = None,
    clear_autocast_cache: bool = False,
    compute_dtype: Union[torch.dtype, NoneType] = None,
    flatten_parameters: bool = True,
    force_input_to_fp32: bool = False,
    fp32_reduce_scatter: bool = False,
    gradient_predivide_factor: Union[float, NoneType] = None,
    gradient_postdivide_factor: Union[float, NoneType] = None,
    move_grads_to_cpu: Union[bool, NoneType] = None,
    move_params_to_cpu: bool = False,
    no_broadcast_optim_state: Union[bool, NoneType] = False,
    reshard_after_forward: bool = True,
    verbose: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| bucket_cap_mb | int, default: 25 | FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters. bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple bucket_cap_mb sized buffers shared by all FSDP instances). Large gradient tensors are directly reduced without using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during the backward pass and freed at the end of the backward pass to save more memory for other phases of the training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the backward. In FSDP, the buffer size does not change with model size (it changes based on the number of `<dtype, device, process_group>` tuples) and gradient ready order matters little since FSDP has a final flush call that ensures everything is reduced and not all gradients need to be known upfront. Overlapping with compute is done differently too. Values <= 0 disable bucketing | None |
| buffer_dtype | Optional[torch.dtype], default: None | dtype for buffers for computation; defaults to the value of compute_dtype | None |
| clear_autocast_cache | bool, default: False | When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this flag to true will help clear this cache as inner FSDP instances finish part of the forward pass to save GPU memory | None |
| compute_dtype | Optional[torch.dtype], default: None | dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set, in which case it defaults to torch.float16. | torch.float32 |
| flatten_parameters | bool, default: True | flatten parameters into a single contiguous tensor, which improves training speed | None |
| force_input_to_fp32 | bool, default: False | force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper. | None |
| fp32_reduce_scatter | bool, default: False | reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used | None |
| gradient_predivide_factor | Optional[float], default: None | divide factor before the reduction | None |
| gradient_postdivide_factor | Optional[float], default: None | divide factor after the reduction | None |
| move_grads_to_cpu | Optional[bool], default: None | move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used | None |
| move_params_to_cpu | bool, default: False | offload FP32 params to CPU. This is only relevant when FP16 AMP is used | None |
| no_broadcast_optim_state | Optional[bool], default: False | do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few parameters can fit on one node | None |
| reshard_after_forward | bool, default: True | reshard parameters after the forward pass. This saves memory but slows training. This is only relevant when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html) | None |
| verbose | bool, default: False | turn on verbose output for model’s string representation | None |

??? example "View Source" class FairscaleFSDPConfig:

        """Fairscale Fully Sharded Data Parallel configuration class



        Attributes

        ----------

        bucket_cap_mb: int, default: 25

            FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.

            bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the

            max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple

            bucket_cap_mb sized buffers shared by all FSDP instances). Large gradient tensors are directly reduced without

            using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with

            computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory

            overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during

            the backward pass and freed at the end of the backward pass to save more memory for other phases of the

            training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP

            engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The

            order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the

            backward. In FSDP, the buffer size does not change with model size (it changes based on number of

            <dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush

            call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with

            compute is done differently too. Values <= 0 disable bucketing

        buffer_dtype: Optional[torch.dtype], default: None

            dtype for buffers for computation. defaults to value of compute_dtype

        clear_autocast_cache: bool, default: False

            When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast

            maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this

            flag to true will help clear this cache as inner FSDP instances finish part of the forward pass to save

            GPU memory

        compute_dtype: Optional[torch.dtype], default: None

            dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,

            in which case it defaults to torch.float16.

        flatten_parameters: bool, default: True

            flatten parameters into a single contiguous tensor, which improves training speed

        force_input_to_fp32: bool, default: False

            force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision

            mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.

        fp32_reduce_scatter: bool, default: False

            reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used

        gradient_predivide_factor: Optional[float], default: None

            divide factor before the reduction

        gradient_postdivide_factor: Optional[float], default: None

            divide factor after the reduction

        move_grads_to_cpu: Optional[bool], default: None

            move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used

        move_params_to_cpu: bool, default: False

            offload FP32 params to CPU. This is only relevant when FP16 AMP is used

        no_broadcast_optim_state: Optional[bool], default: False

            do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this

            true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the

            proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few

            parameters can fit on one node

        reshard_after_forward: bool, default: True

            reshard parameters after the forward pass. This saves memory but slows training. This is only relevant

            when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)

        verbose: bool, default: False

            turn on verbose output for model’s string representation



        Notes

        -----

        mixed_precision: bool

            This value will automatically be set from the Stoke FP16 selected option (AMP only)

        state_dict_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup

        compute_device: torch.device

            this is not exposed as it should be managed internally from the DDP backend setup



        """



        bucket_cap_mb: int = 25

        buffer_dtype: Optional[torch.dtype] = None

        clear_autocast_cache: bool = False

        compute_dtype: Optional[torch.dtype] = None

        flatten_parameters: bool = True

        force_input_to_fp32: bool = False

        fp32_reduce_scatter: bool = False

        gradient_predivide_factor: Optional[float] = None

        gradient_postdivide_factor: Optional[float] = None

        move_grads_to_cpu: Optional[bool] = None

        move_params_to_cpu: bool = False

        no_broadcast_optim_state: Optional[bool] = False

        reshard_after_forward: bool = True

        verbose: bool = False

Descendants

  • stoke.extensions._FairscaleFSDPConfig
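
A hedged sketch of a memory-leaning FSDP setup; the values are illustrative, and mixed_precision, state_dict_device, and compute_device are managed internally by Stoke as noted above:

```python
from stoke.configs import FairscaleFSDPConfig

fsdp_config = FairscaleFSDPConfig(
    bucket_cap_mb=25,            # values <= 0 would disable gradient bucketing
    flatten_parameters=True,     # single contiguous tensor, faster training
    reshard_after_forward=True,  # saves memory at the cost of training speed
    move_grads_to_cpu=True,      # only relevant when FP16 AMP is used
    fp32_reduce_scatter=True,    # reduce-scatter gradients in FP32 under FP16 AMP
)
```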

FairscaleOSSConfig

class FairscaleOSSConfig(
    broadcast_fp16: bool = False
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| broadcast_fp16 | bool, default: False | Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy. | None |

??? example "View Source" class FairscaleOSSConfig:

        """Fairscale optimizer state sharding configuration class



        Attributes

        ----------

        broadcast_fp16: bool, default: False

            Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP

            is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.



        """



        broadcast_fp16: bool = False

FairscaleSDDPConfig

class FairscaleSDDPConfig(
    auto_refresh_trainable: bool = True,
    broadcast_buffers: bool = True,
    reduce_buffer_size: int = 8388608,
    reduce_fp16: bool = False,
    sync_models_at_startup: bool = True
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| auto_refresh_trainable | bool, default: True | Check whether the parameters' trainability (requires_grad) has changed and update both ShardedDDP and OSS automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a parameter is frozen or unfrozen | None |
| broadcast_buffers | bool, default: True | Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same setting as in PyTorch DDP; this is in addition to the broadcast and reduction of the model parameters. | None |
| reduce_buffer_size | int, default: 2 ** 23 | The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact the long term memory consumption, because these buckets correspond to parameters which will not be sharded. Set to 0 to remove all bucketing, 1M to 8M is usually reasonable. | None |
| reduce_fp16 | bool, default: False | Cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and will also save some memory. | None |
| sync_models_at_startup | bool, default: True | Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or the training restarts from a saved state | None |

??? example "View Source" class FairscaleSDDPConfig:

        """Fairscale sharded data parallel (SDDP) configuration class



        Attributes

        ----------

        auto_refresh_trainable: bool, default: True

            Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS

            automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a

            parameter is frozen or unfrozen

        broadcast_buffers: bool, default: True

            Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same

            setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters.

        reduce_buffer_size: int, default: 2 ** 23

            The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact

            the long term memory consumption, because these buckets correspond to parameters which will not be sharded.

            Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.

        reduce_fp16: bool, default: False

            cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve

            performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and

            will also save some memory.

        sync_models_at_startup: bool, default: True

            Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or

            the training restarts from a saved state



        """



        auto_refresh_trainable: bool = True

        broadcast_buffers: bool = True

        reduce_buffer_size: int = 2 ** 23

        reduce_fp16: bool = False

        sync_models_at_startup: bool = True
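
Fairscale's sharded data parallel wrapper is typically paired with optimizer state sharding (OSS); a hedged sketch of both configs with illustrative values:

```python
from stoke.configs import FairscaleOSSConfig, FairscaleSDDPConfig

oss_config = FairscaleOSSConfig(broadcast_fp16=False)

sddp_config = FairscaleSDDPConfig(
    reduce_buffer_size=2 ** 23,   # elements; 0 removes bucketing, 1M to 8M is reasonable
    reduce_fp16=True,             # cast grads to fp16 before reducing (multi-node + AMP)
    sync_models_at_startup=True,  # skip only if every rank starts from the same seed/state
)
```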

HorovodConfig

class HorovodConfig(
    compression: bool = False,
    convert_to_sync_batch_norm: bool = False,
    gradient_predivide_factor: float = 1.0,
    op: stoke.configs.HorovodOps = 'Average'
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| compression | bool, default: False | Compression algorithm used during allreduce to reduce the amount of data sent during each parameter update step. | None |
| convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm | None |
| gradient_predivide_factor | float, default: 1.0 | If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum. | None |
| op | HorovodOps, default: 'Average' | The reduction operation to use when combining gradients across different ranks. | None |

??? example "View Source" class HorovodConfig:

        """Horovod configuration class



        Attributes

        ----------

        compression: bool, default: False

            Compression algorithm used during allreduce to reduce the amount of data sent during each parameter

            update step.

        convert_to_sync_batch_norm: bool, default: False

            Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls

            https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm

        gradient_predivide_factor: float, default: 1.0

            If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled

            by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.

        op: HorovodOps, default: 'Average'

            The reduction operation to use when combining gradients across different ranks.



        """



        compression: bool = False

        convert_to_sync_batch_norm: bool = False

        gradient_predivide_factor: float = 1.0

        op: HorovodOps = "Average"
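
A hedged sketch of a Horovod setup that averages gradients and pre-divides them for fp16 stability at larger world sizes; the values are illustrative:

```python
from stoke.configs import HorovodConfig

horovod_config = HorovodConfig(
    op="Average",                   # HorovodOps: 'Average', 'Sum', or 'Adasum'
    gradient_predivide_factor=2.0,  # scale by 1/2 before the sum, 2/size after it
    convert_to_sync_batch_norm=True,
    compression=True,               # compress data sent during allreduce
)
```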

HorovodOps

class HorovodOps(
    /,
    *args,
    **kwargs
)

??? example "View Source" class HorovodOps(Enum):

        """Horovod ops options"""



        Average = "Average"

        Sum = "Sum"

        Adasum = "Adasum"

Ancestors (in MRO)

  • enum.Enum

Class variables

Adasum
Average
Sum
name
value

OffloadDevice

class OffloadDevice(
    /,
    *args,
    **kwargs
)

??? example "View Source" class OffloadDevice(Enum):

        """Offload device options"""



        none = "none"

        cpu = "cpu"

        nvme = "nvme"

Ancestors (in MRO)

  • enum.Enum

Class variables

cpu
name
none
nvme
value

StokeOptimizer

class StokeOptimizer(
    /,
    *args,
    **kwargs
)

Attributes

| Name | Type | Description | Default |
|---|---|---|---|
| optimizer | Type[torch.optim.Optimizer] | un-instantiated torch.optim.Optimizer class | None |
| optimizer_kwargs | Dict | any keyword args to be unrolled into the optimizer at instantiation time | None |

??? example "View Source" class StokeOptimizer(TypedDict):

        """Stoke optimizer wrapper class



        Given all the different backends and extensions the optimizer might need to be instantiated in a different way

        thus this typed dict holds the configuration without instantiation



        Attributes

        ----------

        optimizer: Type[torch.optim.Optimizer]

            un-instantiated torch.optim.Optimizer class

        optimizer_kwargs: Dict

            any keyword args to be unrolled into the optimizer at instantiation time



        """



        optimizer: Type[torch.optim.Optimizer]

        optimizer_kwargs: Dict
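
Because StokeOptimizer is a TypedDict, constructing it just produces a plain dict; a minimal sketch with assumed AdamW hyper-parameters:

```python
import torch

from stoke.configs import StokeOptimizer

# The optimizer class is passed un-instantiated; the kwargs are unrolled into it
# later, since different backends/extensions instantiate the optimizer differently
opt = StokeOptimizer(
    optimizer=torch.optim.AdamW,
    optimizer_kwargs={"lr": 1e-3, "weight_decay": 0.01},
)
```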

Ancestors (in MRO)

  • builtins.dict

Methods

clear

def clear(
    ...
)

D.clear() -> None. Remove all items from D.

copy

def copy(
    ...
)

D.copy() -> a shallow copy of D

fromkeys

def fromkeys(
    iterable,
    value=None,
    /
)

Create a new dictionary with keys from iterable and values set to value.

get

def get(
    self,
    key,
    default=None,
    /
)

Return the value for key if key is in the dictionary, else default.

items

def items(
    ...
)

D.items() -> a set-like object providing a view on D's items

keys

def keys(
    ...
)

D.keys() -> a set-like object providing a view on D's keys

pop

def pop(
    ...
)

D.pop(k[,d]) -> v, remove specified key and return the corresponding value.

If key is not found, d is returned if given, otherwise KeyError is raised

popitem

def popitem(
    self,
    /
)

Remove and return a (key, value) pair as a 2-tuple.

Pairs are returned in LIFO (last-in, first-out) order. Raises KeyError if the dict is empty.

setdefault

def setdefault(
    self,
    key,
    default=None,
    /
)

Insert key with a value of default if key is not in the dictionary.

Return the value for key if key is in the dictionary, else default.

update

def update(
    ...
)

D.update([E, ]**F) -> None. Update D from dict/iterable E and F.

If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v In either case, this is followed by: for k in F: D[k] = F[k]

values

def values(
    ...
)

D.values() -> an object providing a view on D's values
