Module stoke.configs
Handles all config objects
??? example "View Source" # -*- coding: utf-8 -*-
# Copyright FMR LLC <opensource@fidelity.com>
# SPDX-License-Identifier: Apache-2.0
"""Handles all config objects"""
from enum import Enum
from typing import Dict, Optional, Type
import attr
import torch
try:
from typing import TypedDict
except ImportError:
from mypy_extensions import TypedDict
class HorovodOps(Enum):
"""Horovod ops options"""
Average = "Average"
Sum = "Sum"
Adasum = "Adasum"
class OffloadDevice(Enum):
"""Offload device options"""
none = "none"
cpu = "cpu"
nvme = "nvme"
class BackendOptions(Enum):
"""Communication backend options"""
nccl = "nccl"
mpi = "mpi"
gloo = "gloo"
@attr.s(auto_attribs=True)
class AMPConfig:
"""PyTorch AMP configuration class
Attributes
----------
backoff_factor : float, default: 0.5
Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration
growth_factor : float, default: 2.0
Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.
growth_interval : int, default: 2000
Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by
growth_factor
init_scale : float, default: 2.**16
Initial scale factor
"""
backoff_factor: float = 0.5
growth_factor: float = 2.0
growth_interval: int = 2000
init_scale: float = 2.0 ** 16
@attr.s(auto_attribs=True)
class ApexConfig:
"""Nvidia APEX configuration class
Attributes
----------
cast_model_outputs: Optional[torch.dtype], default: None
Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls
https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm
max_loss_scale: float, default: 2.**24
Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling
min_loss_scale: Optional[float], default: None
Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None
means that no floor is imposed
scaler_per_loss: bool, default: False
Option to impose a scaler for each loss instead of a global scaler
verbosity: int, default: 0
Set to 0 to suppress Amp-related output
"""
cast_model_outputs: Optional[torch.dtype] = None
convert_to_sync_batch_norm: bool = False
max_loss_scale: float = 2.0 ** 24
min_loss_scale: Optional[float] = None
scaler_per_loss: bool = False
verbosity: int = 0
@attr.s(auto_attribs=True)
class ClipGradConfig:
"""Gradient clipping by value configuration class
Attributes
----------
clip_value: float
maximum allowed absolute value of the gradients [-clip_value, clip_value]
"""
clip_value: float
@attr.s(auto_attribs=True)
class ClipGradNormConfig:
"""Gradient clipping by p-norm configuration class
Attributes
----------
max_norm: float
max norm of the gradients
norm_type: float
type of the used p-norm
"""
max_norm: float
norm_type: float
@attr.s(auto_attribs=True)
class DDPConfig:
"""PyTorch DistributedDataParallel configuration class
Attributes
----------
local_rank: Optional[int]
Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_rank)
auto_mpi_discovery: bool, default: False
if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed
function call)
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls
https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html
backend: BackendOptions, default: 'nccl'
Which communication backend to use
broadcast_buffers: bool, default: True
Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function
bucket_cap_mb: int, default: 25
DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket
can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)
find_unused_parameters: bool, default: False
Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward
function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready
to be reduced. Note that all forward outputs that are derived from module parameters must participate in
calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd
to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused
can be detached from the autograd graph using torch.Tensor.detach
gradient_as_bucket_view: bool, default: False
When set to True, gradients will be views pointing to different offsets of allreduce communication
buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients
size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When
gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by
referring to the zero_grad() function in torch/optim/optimizer.py as a solution.
init_method: str, default: 'env://'
URL specifying how to initialize the process group
no_sync: bool, default: True
for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on
module variables, which will later be synchronized in the first forward-backward pass after exiting the
context. no_sync might lead to higher memory usage but lower communication overhead
"""
local_rank: Optional[int]
auto_mpi_discovery: bool = False
convert_to_sync_batch_norm: bool = False
backend: BackendOptions = "nccl"
broadcast_buffers: bool = True
bucket_cap_mb: int = 25
find_unused_parameters: bool = False
gradient_as_bucket_view: bool = False
init_method: str = "env://"
no_sync: bool = True
@attr.s(auto_attribs=True)
class DeepspeedAIOConfig:
"""Deepspeed asynchronous I/O configuration class
Attributes
----------
block_size: int, default: 1048576
I/O block size in bytes
ignore_unused_parameters: bool, default: True
Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.
This controls whether or not training should terminate with an error message when unused parameters are
detected.
overlap_events: bool, default: True
Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.
queue_depth: int, default: 8
I/O queue depth
single_submit: bool, default: False
Submit requests to storage device as multiple individual requests as opposed to one block of requests.
thread_count: int, default: 1
Intra-request parallelism for each read/write submitted by a user thread.
"""
block_size: int = 1048576
ignore_unused_parameters: bool = True
overlap_events: bool = True
queue_depth: int = 8
single_submit: bool = False
thread_count: int = 1
@attr.s(auto_attribs=True)
class DeepspeedActivationCheckpointingConfig:
"""Deepspeed activation checkpointing configuration class
Attributes
----------
contiguous_memory_optimization: bool, default: False
Copies partitioned activations so that they are contiguous in memory
cpu_checkpointing: bool, default: False
Offloads partitioned activations to CPU if partition_activations is enabled
number_checkpoints: Optional[int], default: None
Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization
partition_activations: bool, default: False
Enables partition activation when used with model parallelism
profile: bool, default: False
Logs the forward and backward time for each checkpoint function
synchronize_checkpoint_boundary: bool, default: False
Inserts torch.cuda.synchronize() at each checkpoint boundary
"""
contiguous_memory_optimization: bool = False
cpu_checkpointing: bool = False
number_checkpoints: Optional[int] = None
partition_activations: bool = False
profile: bool = False
synchronize_checkpoint_boundary: bool = False
@attr.s(auto_attribs=True)
class DeepspeedFlopsConfig:
"""Deepspeed flops profiler configuration class
Attributes
----------
detailed: bool, default: True
Whether to print the detailed model profile
module_depth: int, default: -1
The depth of the model at which to print the aggregated module information. When set to -1, it prints
information from the top module to the innermost modules (the maximum depth).
output_file: Optional[str], default: None
Path to the output file. If None, the profiler prints to stdout
profile_step: int, default: 1
The global training step at which to profile.
top_modules: int, default: 1
Limits the aggregated profile output to the number of top modules specified.
Notes
-----
Warm up steps are needed for accurate time measurement
"""
detailed: bool = True
module_depth: int = -1
output_file: Optional[str] = None
profile_step: int = 1
top_modules: int = 1
@attr.s(auto_attribs=True)
class DeepspeedFP16Config:
"""Deepspeed FP16 configuration class
Attributes
----------
hysteresis: int, default: 2
represents the delay shift in dynamic loss scaling
initial_scale_power: int, default: 32
power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power
loss_scale: float, default: 0.0
loss scaling value for FP16 training (0.0 --> dynamic scaling)
loss_scale_window: int, default: 1000
the window over which to raise/lower the dynamic loss scale value
min_loss_scale: int, default: 1000
minimum dynamic loss scale value
"""
hysteresis: int = 2
initial_scale_power: int = 32
loss_scale: float = 0.0
loss_scale_window: int = 1000
min_loss_scale: int = 1000
@attr.s(auto_attribs=True)
class DeepspeedOffloadOptimizerConfig:
"""Deepspeed optimizer offloading configuration class
Attributes
----------
buffer_count: int, default: 4
Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number
of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,
gradient, momentum, and variance).
device: OffloadDevice, default: 'cpu'
Device memory to offload optimizer state
fast_init: bool, default: False
Enable fast optimizer initialization when offloading to NVMe
nvme_path: str, default: '/local_nvme'
Filesystem path for NVMe device for optimizer state offloading
pin_memory: bool, default: False
Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.
pipeline: bool, default: False
pipeline activated (will default to True if either pipeline_read or pipeline_write is set)
pipeline_read: bool, default: False
activate pipeline read (deepspeed has limited docs for what this does)
pipeline_write: bool, default: False
activate pipeline write (deepspeed has limited docs for what this does)
"""
buffer_count: int = 4
device: OffloadDevice = "cpu"
fast_init: bool = False
nvme_path: str = "/local_nvme"
pin_memory: bool = False
pipeline: bool = False
pipeline_read: bool = False
pipeline_write: bool = False
@attr.s(auto_attribs=True)
class DeepspeedOffloadParamConfig:
"""Deepspeed parameter offloading configuration class
Attributes
----------
buffer_count: int, default: 5
Number of buffers in buffer pool for parameter offloading to NVMe
buffer_size: int, default: int(1E8)
Size of buffers in buffer pool for parameter offloading to NVMe
device: OffloadDevice, default: 'cpu'
Device memory to offload model parameters
max_in_cpu: int, default: int(1E9)
Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.
nvme_path: str, default: '/local_nvme'
Filesystem path for NVMe device for parameter offloading
pin_memory: bool, default: False
Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.
"""
buffer_count: int = 5
buffer_size: int = int(1e8)
device: OffloadDevice = "cpu"
max_in_cpu: int = int(1e9)
nvme_path: str = "/local_nvme"
pin_memory: bool = False
@attr.s(auto_attribs=True)
class DeepspeedPLDConfig:
"""
Attributes
----------
theta: float, default: 1.0
Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,
the faster the training speed
gamma: float, default: 0.001
Hyper-parameter that controls how fast the drop ratio increases
"""
theta: float = 1.0
gamma: float = 0.001
@attr.s(auto_attribs=True)
class DeepspeedTensorboardConfig:
"""Deepspeed Tensorboard configuration class
Attributes
----------
output_path: str, default: ''
Tensorboard output path
job_name: str, default: 'DeepSpeedJobName'
Tensorboard job name
"""
output_path: str = ""
job_name: str = "DeepSpeedJobName"
@attr.s(auto_attribs=True)
class DeepspeedZeROConfig:
"""Deepspeed ZeRO configuration class
Attributes
----------
allgather_bucket_size: int, default: int(5E8)
Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes
allgather_partitions: bool, default: True
Chooses between allgather collective or a series of broadcast collectives to gather updated parameters
from all the GPUs at the end of each step
contiguous_gradients: bool, default: False
Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward
pass. Only useful when running very large models.
ignore_unused_parameters: bool, default: True
Currently only used in stage 2's complete_grad_norm_calculation_for_cpu_offload. Enable this option to avoid
https://github.com/microsoft/DeepSpeed/issues/707
legacy_stage1: bool, default: False
Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatability reasons
offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None
Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU
memory for larger models or batch sizes. Valid only with stage 3
offload_param: Optional[DeepspeedOffloadParamConfig], default: None
Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch
sizes. Valid only with stage 3.
overlap_comm: bool, default: False
Attempts to overlap the reduction of the gradients with backward computation
reduce_bucket_size: int, default: int(5E8)
Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large
model sizes
reduce_scatter: bool, default: True
Uses reduce or reduce scatter instead of allreduce to average gradients
stage: int, default: 0
Chooses different stages of the ZeRO optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state
partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,
respectively
stage3_max_live_parameters: int, default: int(1E9)
The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but
perform more communication.
stage3_max_reuse_distance: int, default: int(1E9)
Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less
memory, but perform more communication.
stage3_prefetch_bucket_size: int, default: int(5E8)
The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase
stalls due to communication.
stage3_param_persistence_threshold: int, default: int(1E6)
Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly
increase communication (especially latency-bound messages).
stage3_gather_fp16_weights_on_model_save: bool, default: False
Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned
across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this
option is enabled and then saves the fp16 model weights.
sub_group_size: int, default: int(1E12)
sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are
grouped into buckets of sub_group_size and each bucket is updated one at a time.
"""
allgather_bucket_size: int = int(5e8)
allgather_partitions: bool = True
contiguous_gradients: bool = False
ignore_unused_parameters: bool = True
legacy_stage1: bool = False
offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None
offload_param: Optional[DeepspeedOffloadParamConfig] = None
overlap_comm: bool = False
reduce_bucket_size: int = int(5e8)
reduce_scatter: bool = True
stage: int = 0
stage3_max_live_parameters: int = int(1e9)
stage3_max_reuse_distance: int = int(1e9)
stage3_prefetch_bucket_size: int = int(5e8)
stage3_param_persistence_threshold: int = int(1e6)
stage3_gather_fp16_weights_on_model_save: bool = False
sub_group_size: int = int(1e12)
@attr.s(auto_attribs=True)
class DeepspeedConfig:
"""Deepspeed configuration class
Composed of other configuration classes related to specific functionality
Attributes
----------
activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()
Enables and configures activation checkpointing
aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()
Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent
(NVMe) storage
auto_mpi_discovery: bool, default: True
if distributed environment variables are not set, attempt to discover them from MPI
disable_allgather: bool, default: False
Disables allgather
dist_backend: BackendOptions, default: 'nccl'
Which communication backend to use
distributed_port: int, default: 29500
torch distributed backend port
dump_state: bool, default: False
Print out state information of DeepSpeed object after initialization
flops_profiler: Optional[DeepspeedFlopsConfig], default: None
Enables and configures the flops profiler. This would also enable wall_clock_breakdown
fp16: Optional[DeepspeedFP16Config], default: None
Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package
fp32_allreduce: bool, default: False
During gradient averaging perform allreduce with 32 bit values
gradient_predivide_factor: float, default: 1.0
Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability
when scaling to large numbers of GPUs
init_method: str, default: 'env://'
URL specifying how to initialize the process group
prescale_gradients: bool, default: False
Scale gradients before doing allreduce
progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None
Enables and configures progressive layer dropping
sparse_gradients: bool, default: False
Enable sparse compression of torch.nn.Embedding gradients
steps_per_print: int, default: 10
Print train loss every N steps
tensorboard: Optional[DeepspeedTensorboardConfig], default: None
Enables and configures tensorboard support
verbose: bool, default: True
flag to make deepspeed engine verbose with information
wall_clock_breakdown: bool, default: False
Enable timing of the latency of forward/backward/update training phases
zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()
Enables and configures ZeRO memory optimizations
Notes
-----
Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16
here is similar to AMP’s O2 mode
"""
activation_checkpointing: Optional[
DeepspeedActivationCheckpointingConfig
] = DeepspeedActivationCheckpointingConfig()
aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()
auto_mpi_discovery: bool = True
disable_allgather: bool = False
dist_backend: BackendOptions = "nccl"
distributed_port: int = 29500
dump_state: bool = False
flops_profiler: Optional[DeepspeedFlopsConfig] = None
fp16: Optional[DeepspeedFP16Config] = None
fp32_allreduce: bool = False
gradient_predivide_factor: float = 1.0
init_method: str = "env://"
prescale_gradients: bool = False
progressive_layer_drop: Optional[DeepspeedPLDConfig] = None
sparse_gradients: bool = False
steps_per_print: int = 10
tensorboard: Optional[DeepspeedTensorboardConfig] = None
verbose: bool = True
wall_clock_breakdown: bool = False
zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()
@attr.s(auto_attribs=True)
class FairscaleOSSConfig:
"""Fairscale optimizer state sharding configuration class
Attributes
----------
broadcast_fp16: bool, default: False
Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP
is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.
"""
broadcast_fp16: bool = False
@attr.s(auto_attribs=True)
class FairscaleSDDPConfig:
"""Fairscale sharded data parallel (SDDP) configuration class
Attributes
----------
auto_refresh_trainable: bool, default: True
Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS
automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a
parameter is frozen or unfrozen
broadcast_buffers: bool, default: True
Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same
setting as in PyTorch DDP; this is in addition to the broadcast and reduction of the model parameters.
reduce_buffer_size: int, default: 2 ** 23
The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact
the long term memory consumption, because these buckets correspond to parameters which will not be sharded.
Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.
reduce_fp16: bool, default: False
cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve
performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and
will also save some memory.
sync_models_at_startup: bool, default: True
Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or
the training restarts from a saved state
"""
auto_refresh_trainable: bool = True
broadcast_buffers: bool = True
reduce_buffer_size: int = 2 ** 23
reduce_fp16: bool = False
sync_models_at_startup: bool = True
@attr.s(auto_attribs=True)
class FairscaleFSDPConfig:
"""Fairscale Fully Sharded Data Parallel configuration class
Attributes
----------
bucket_cap_mb: int, default: 25
FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.
bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the
max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple
bucket_cap_mb sized buffers) shared by all FSDP instances. Large gradient tensors are directly reduced without
using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with
computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory
overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during
the backward pass and freed at the end of the backward pass to save more memory for other phases of the
training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP
engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The
order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the
backward. In FSDP, the buffer size does not change with model size (it changes based on number of
<dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush
call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with
compute is done differently too. Values <= 0 disable bucketing
buffer_dtype: Optional[torch.dtype], default: None
dtype for buffers for computation. defaults to value of compute_dtype
clear_autocast_cache: bool, default: False
When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast
maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this
flag to true will help clear this cache as inner FSDP instances finish part of the forward pass to save
GPU memory
compute_dtype: Optional[torch.dtype], default: None
dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,
in which case it defaults to torch.float16.
flatten_parameters: bool, default: True
flatten parameters into a single contiguous tensor, which improves training speed
force_input_to_fp32: bool, default: False
force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision
mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.
fp32_reduce_scatter: bool, default: False
reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used
gradient_predivide_factor: Optional[float], default: None
divide factor before the reduction
gradient_postdivide_factor: Optional[float], default: None
divide factor after the reduction
move_grads_to_cpu: Optional[bool], default: None
move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used
move_params_to_cpu: bool, default: False
offload FP32 params to CPU. This is only relevant when FP16 AMP is used
no_broadcast_optim_state: Optional[bool], default: False
do not broadcast this module’s optimizer state when gather_full_optim_state_dict is called. If you set this
true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the
proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few
parameters can fit on one node
reshard_after_forward: bool, default: True
reshard parameters after the forward pass. This saves memory but slows training. This is only relevant
when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)
verbose: bool, default: False
turn on verbose output for model’s string representation
Notes
-----
mixed_precision: bool
This value will automatically be set from the Stoke FP16 selected option (AMP only)
state_dict_device: torch.device
this is not exposed as it should be managed internally from the DDP backend setup
compute_device: torch.device
this is not exposed as it should be managed internally from the DDP backend setup
"""
bucket_cap_mb: int = 25
buffer_dtype: Optional[torch.dtype] = None
clear_autocast_cache: bool = False
compute_dtype: Optional[torch.dtype] = None
flatten_parameters: bool = True
force_input_to_fp32: bool = False
fp32_reduce_scatter: bool = False
gradient_predivide_factor: Optional[float] = None
gradient_postdivide_factor: Optional[float] = None
move_grads_to_cpu: Optional[bool] = None
move_params_to_cpu: bool = False
no_broadcast_optim_state: Optional[bool] = False
reshard_after_forward: bool = True
verbose: bool = False
@attr.s(auto_attribs=True)
class HorovodConfig:
"""Horovod configuration class
Attributes
----------
compression: bool, default: False
Compression algorithm used during allreduce to reduce the amount of data sent during each parameter
update step.
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls
https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm
gradient_predivide_factor: float, default: 1.0
If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled
by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.
op: HorovodOps, default: 'Average'
The reduction operation to use when combining gradients across different ranks.
"""
compression: bool = False
convert_to_sync_batch_norm: bool = False
gradient_predivide_factor: float = 1.0
op: HorovodOps = "Average"
class StokeOptimizer(TypedDict):
"""Stoke optimizer wrapper class
Given all the different backends and extensions, the optimizer might need to be instantiated in a different way;
thus this typed dict holds the configuration without instantiation
Attributes
----------
optimizer: Type[torch.optim.Optimizer]
un-instantiated torch.optim.Optimizer class
optimizer_kwargs: Dict
any keyword args to be unrolled into the optimizer at instantiation time
"""
optimizer: Type[torch.optim.Optimizer]
optimizer_kwargs: Dict
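Before the per-class reference below, a minimal sketch of how these pieces fit together. It assumes the config classes are re-exported at the package root (as the project README suggests); otherwise import them from stoke.configs:

```python
import torch

from stoke import AMPConfig, StokeOptimizer  # assumed package-root re-exports

# StokeOptimizer defers instantiation: it pairs an un-instantiated optimizer
# class with its kwargs so the chosen backend can construct it appropriately
opt = StokeOptimizer(
    optimizer=torch.optim.Adam,
    optimizer_kwargs={"lr": 1e-3},
)

# Config objects are plain attrs classes that only carry settings; they are
# handed to the main Stoke object (per the README, via its configs argument)
amp_config = AMPConfig(init_scale=2.0 ** 14)
```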
Classes
AMPConfig
class AMPConfig(
backoff_factor: float = 0.5,
growth_factor: float = 2.0,
growth_interval: int = 2000,
init_scale: float = 65536.0
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
backoff_factor | float, default: 0.5 | Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration | None |
growth_factor | float, default: 2.0 | Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations. | None |
growth_interval | int, default: 2000 | Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by growth_factor | None |
init_scale | float, default: 2.**16 | Initial scale factor | None |
??? example "View Source" class AMPConfig:
"""PyTorch AMP configuration class
Attributes
----------
backoff_factor : float, default: 0.5
Factor by which the scale is multiplied during update if inf/NaN gradients occur in an iteration
growth_factor : float, default: 2.0
Factor by which the scale is multiplied during update if no inf/NaN gradients occur for growth_interval consecutive iterations.
growth_interval : int, default: 2000
Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by
growth_factor
init_scale : float, default: 2.**16
Initial scale factor
"""
backoff_factor: float = 0.5
growth_factor: float = 2.0
growth_interval: int = 2000
init_scale: float = 2.0 ** 16
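As a usage sketch, a more conservative dynamic loss scaler -- starting lower and growing less eagerly -- looks like this (import path assumes the package-root re-export):

```python
from stoke import AMPConfig

# Start scaling at 2**14 instead of 2**16 and require 4000 consecutive
# clean iterations (no inf/NaN gradients) before each growth step
amp_config = AMPConfig(
    init_scale=2.0 ** 14,
    growth_interval=4000,
)
```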
ApexConfig
class ApexConfig(
cast_model_outputs: Union[torch.dtype, NoneType] = None,
convert_to_sync_batch_norm: bool = False,
max_loss_scale: float = 16777216.0,
min_loss_scale: Union[float, NoneType] = None,
scaler_per_loss: bool = False,
verbosity: int = 0
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
cast_model_outputs | Optional[torch.dtype], default: None | Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level | None |
convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm | None |
max_loss_scale | float, default: 2.**24 | Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling | None |
min_loss_scale | Optional[float], default: None | Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed | None |
scaler_per_loss | bool, default: False | Option to impose a scaler for each loss instead of a global scaler | None |
verbosity | int, default: 0 | Set to 0 to suppress Amp-related output | None |
??? example "View Source" class ApexConfig:
"""Nvidia APEX configuration class
Attributes
----------
cast_model_outputs: Optional[torch.dtype], default: None
Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to apex.parallel.SyncBatchNorm calls
https://nvidia.github.io/apex/parallel.html#apex.parallel.SyncBatchNorm
max_loss_scale: float, default: 2.**24
Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling
min_loss_scale: Optional[float], default: None
Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None
means that no floor is imposed
scaler_per_loss: bool, default: False
Option to impose a scaler for each loss instead of a global scaler
verbosity: int, default: 0
Set to 0 to suppress Amp-related output
"""
cast_model_outputs: Optional[torch.dtype] = None
convert_to_sync_batch_norm: bool = False
max_loss_scale: float = 2.0 ** 24
min_loss_scale: Optional[float] = None
scaler_per_loss: bool = False
verbosity: int = 0
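A short sketch (import path assumed as above): sync batch norm across ranks and pin model outputs to FP32 regardless of opt_level:

```python
import torch

from stoke import ApexConfig

# Convert batch norm layers to apex.parallel.SyncBatchNorm and always
# cast model outputs back to FP32
apex_config = ApexConfig(
    convert_to_sync_batch_norm=True,
    cast_model_outputs=torch.float32,
)
```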
BackendOptions
class BackendOptions(
/,
*args,
**kwargs
)
??? example "View Source" class BackendOptions(Enum):
"""Communication backend options"""
nccl = "nccl"
mpi = "mpi"
gloo = "gloo"
Ancestors (in MRO)
- enum.Enum
Class variables
gloo
mpi
name
nccl
value
ClipGradConfig
class ClipGradConfig(
clip_value: float
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
clip_value | float | maximum allowed absolute value of the gradients [-clip_value, clip_value] | None |
??? example "View Source" class ClipGradConfig:
"""Gradient clipping by value configuration class
Attributes
----------
clip_value: float
maximum allowed absolute value of the gradients [-clip_value, clip_value]
"""
clip_value: float
ClipGradNormConfig
class ClipGradNormConfig(
max_norm: float,
norm_type: float
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
max_norm | float | max norm of the gradients | None |
norm_type | float | type of the used p-norm | None |
??? example "View Source" class ClipGradNormConfig:
"""Gradient clipping by p-norm configuration class
Attributes
----------
max_norm: float
max norm of the gradients
norm_type: float
type of the used p-norm
"""
max_norm: float
norm_type: float
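The two clipping configs are alternative treatments of the same concern; a minimal sketch of each (import path assumed as above):

```python
from stoke import ClipGradConfig, ClipGradNormConfig

# Element-wise clipping of every gradient to [-1.0, 1.0] ...
clip_config = ClipGradConfig(clip_value=1.0)

# ... or clipping the global L2 norm of all gradients to 5.0
clip_norm_config = ClipGradNormConfig(max_norm=5.0, norm_type=2.0)
```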
DDPConfig
class DDPConfig(
local_rank: Union[int, NoneType],
auto_mpi_discovery: bool = False,
convert_to_sync_batch_norm: bool = False,
backend: stoke.configs.BackendOptions = 'nccl',
broadcast_buffers: bool = True,
bucket_cap_mb: int = 25,
find_unused_parameters: bool = False,
gradient_as_bucket_view: bool = False,
init_method: str = 'env://',
no_sync: bool = True
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
local_rank | Optional[int] | Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_rank) | None |
auto_mpi_discovery | bool, default: False | if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed function call) | None |
convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html | None |
backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
broadcast_buffers | bool, default: True | Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function | None |
bucket_cap_mb | int, default: 25 | DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB) | None |
find_unused_parameters | bool, default: False | Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready to be reduced. Note that all forward outputs that are derived from module parameters must participate in calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused can be detached from the autograd graph using torch.Tensor.detach | None |
gradient_as_bucket_view | bool, default: False | When set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by referring to the zero_grad() function in torch/optim/optimizer.py as a solution. | None |
init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
no_sync | bool, default: True | for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on module variables, which will later be synchronized in the first forward-backward pass after exiting the context. no_sync might lead to higher memory usage but lower communication overhead | None |
??? example "View Source" class DDPConfig:
"""PyTorch DistributedDataParallel configuration class
Attributes
----------
local_rank: Optional[int]
Current local rank of the device (provided here, as LOCAL_RANK env var, or parsed from --local_rank)
auto_mpi_discovery: bool, default: False
if distributed environment variables are not set, attempt to discover them from MPI (using underlying deepspeed
function call)
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to torch.nn.SyncBatchNorm calls
https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html
backend: BackendOptions, default: 'nccl'
Which communication backend to use
broadcast_buffers: bool, default: True
Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function
bucket_cap_mb: int, default: 25
DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket
can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MegaBytes (MB)
find_unused_parameters: bool, default: False
Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward
function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready
to be reduced. Note that all forward outputs that are derived from module parameters must participate in
calculating loss and later the gradient computation. If they don’t, this wrapper will hang waiting for autograd
to produce gradients for those parameters. Any outputs derived from module parameters that are otherwise unused
can be detached from the autograd graph using torch.Tensor.detach
gradient_as_bucket_view: bool, default: False
When set to True, gradients will be views pointing to different offsets of allreduce communication
buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients
size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When
gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by
referring to the zero_grad() function in torch/optim/optimizer.py as a solution.
init_method: str, default: 'env://'
URL specifying how to initialize the process group
no_sync: bool, default: True
for any DDP based method (including SDDP and FSDP wrappers) -- if activated, gradients will be accumulated on
module variables, which will later be synchronized in the first forward-backward pass after exiting the
context. no_sync might lead to higher memory usage but lower communication overhead
"""
local_rank: Optional[int]
auto_mpi_discovery: bool = False
convert_to_sync_batch_norm: bool = False
backend: BackendOptions = "nccl"
broadcast_buffers: bool = True
bucket_cap_mb: int = 25
find_unused_parameters: bool = False
gradient_as_bucket_view: bool = False
init_method: str = "env://"
no_sync: bool = True
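A typical single-node multi-GPU sketch, reading the local rank from the launcher's environment (LOCAL_RANK is set by torch's distributed launchers; the import path is assumed as above):

```python
import os

from stoke import BackendOptions, DDPConfig

ddp_config = DDPConfig(
    # commonly provided by the launcher; None defers discovery to stoke
    local_rank=int(os.environ.get("LOCAL_RANK", 0)),
    backend=BackendOptions.nccl,
    convert_to_sync_batch_norm=True,
    bucket_cap_mb=50,  # larger buckets: fewer, larger allreduce calls
)
```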
DeepspeedAIOConfig
class DeepspeedAIOConfig(
block_size: int = 1048576,
ignore_unused_parameters: bool = True,
overlap_events: bool = True,
queue_depth: int = 8,
single_submit: bool = False,
thread_count: int = 1
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
block_size | int, default: 1048576 | I/O block size in bytes | None |
ignore_unused_parameters | bool, default: True | Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. | None |
overlap_events | bool, default: True | Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests. | None |
queue_depth | int, default: 8 | I/O queue depth | None |
single_submit | bool, default: False | Submit requests to storage device as multiple individual requests as opposed to one block of requests. | None |
thread_count | int, default: 1 | Intra-request parallelism for each read/write submitted by a user thread. | None |
??? example "View Source" class DeepspeedAIOConfig:
"""Deepspeed asynchronous I/O configuration class
Attributes
----------
block_size: int, default: 1048576
I/O block size in bytes
ignore_unused_parameters: bool, default: True
Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks.
This controls whether or not training should terminate with an error message when unused parameters are
detected.
overlap_events: bool, default: True
Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests.
queue_depth: int, default: 8
I/O queue depth
single_submit: bool, default: False
Submit requests to storage device as multiple individual requests as opposed to one block of requests.
thread_count: int, default: 1
Intra-request parallelism for each read/write submitted by a user thread.
"""
block_size: int = 1048576
ignore_unused_parameters: bool = True
overlap_events: bool = True
queue_depth: int = 8
single_submit: bool = False
thread_count: int = 1
DeepspeedActivationCheckpointingConfig
class DeepspeedActivationCheckpointingConfig(
contiguous_memory_optimization: bool = False,
cpu_checkpointing: bool = False,
number_checkpoints: Union[int, NoneType] = None,
partition_activations: bool = False,
profile: bool = False,
synchronize_checkpoint_boundary: bool = False
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
contiguous_memory_optimization | bool, default: False | Copies partitioned activations so that they are contiguous in memory | None |
cpu_checkpointing | bool, default: False | Offloads partitioned activations to CPU if partition_activations is enabled | None |
number_checkpoints | Optional[int], default: None | Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | None |
partition_activations | bool, default: False | Enables partition activation when used with model parallelism | None |
profile | bool, default: False | Logs the forward and backward time for each checkpoint function | None |
synchronize_checkpoint_boundary | bool, default: False | Inserts torch.cuda.synchronize() at each checkpoint boundary | None |
??? example "View Source" class DeepspeedActivationCheckpointingConfig:
"""Deepspeed activation checkpointing configuration class
Attributes
----------
contiguous_memory_optimization: bool, default: False
Copies partitioned activations so that they are contiguous in memory
cpu_checkpointing: bool, default: False
Offloads partitioned activations to CPU if partition_activations is enabled
number_checkpoints: Optional[int], default: None
Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization
partition_activations: bool, default: False
Enables partition activation when used with model parallelism
profile: bool, default: False
Logs the forward and backward time for each checkpoint function
synchronize_checkpoint_boundary: bool, default: False
Inserts torch.cuda.synchronize() at each checkpoint boundary
"""
contiguous_memory_optimization: bool = False
cpu_checkpointing: bool = False
number_checkpoints: Optional[int] = None
partition_activations: bool = False
profile: bool = False
synchronize_checkpoint_boundary: bool = False
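A sketch that trades transfer time for GPU memory by partitioning activations and spilling the partitions to CPU (import path assumed as above):

```python
from stoke import DeepspeedActivationCheckpointingConfig

act_ckpt_config = DeepspeedActivationCheckpointingConfig(
    partition_activations=True,
    cpu_checkpointing=True,  # only takes effect with partition_activations
)
```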
DeepspeedConfig
class DeepspeedConfig(
activation_checkpointing: Union[stoke.configs.DeepspeedActivationCheckpointingConfig, NoneType] = DeepspeedActivationCheckpointingConfig(contiguous_memory_optimization=False, cpu_checkpointing=False, number_checkpoints=None, partition_activations=False, profile=False, synchronize_checkpoint_boundary=False),
aio: Union[stoke.configs.DeepspeedAIOConfig, NoneType] = DeepspeedAIOConfig(block_size=1048576, ignore_unused_parameters=True, overlap_events=True, queue_depth=8, single_submit=False, thread_count=1),
auto_mpi_discovery: bool = True,
disable_allgather: bool = False,
dist_backend: stoke.configs.BackendOptions = 'nccl',
distributed_port: int = 29500,
dump_state: bool = False,
flops_profiler: Union[stoke.configs.DeepspeedFlopsConfig, NoneType] = None,
fp16: Union[stoke.configs.DeepspeedFP16Config, NoneType] = None,
fp32_allreduce: bool = False,
gradient_predivide_factor: float = 1.0,
init_method: str = 'env://',
prescale_gradients: bool = False,
progressive_layer_drop: Union[stoke.configs.DeepspeedPLDConfig, NoneType] = None,
sparse_gradients: bool = False,
steps_per_print: int = 10,
tensorboard: Union[stoke.configs.DeepspeedTensorboardConfig, NoneType] = None,
verbose: bool = True,
wall_clock_breakdown: bool = False,
zero_optimization: Union[stoke.configs.DeepspeedZeROConfig, NoneType] = DeepspeedZeROConfig(allgather_bucket_size=500000000, allgather_partitions=True, contiguous_gradients=False, ignore_unused_parameters=True, legacy_stage1=False, offload_optimizer=None, offload_param=None, overlap_comm=False, reduce_bucket_size=500000000, reduce_scatter=True, stage=0, stage3_max_live_parameters=1000000000, stage3_max_reuse_distance=1000000000, stage3_prefetch_bucket_size=500000000, stage3_param_persistence_threshold=1000000, stage3_gather_fp16_weights_on_model_save=False, sub_group_size=1000000000000)
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
activation_checkpointing | Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig() | Enables and configures activation checkpointing | None |
aio | Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig() | Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent (NVMe) storage | None |
auto_mpi_discovery | bool, default: True | if distributed environment variables are not set, attempt to discover them from MPI | None |
disable_allgather | bool, default: False | Disables allgather | None |
dist_backend | BackendOptions, default: 'nccl' | Which communication backend to use | None |
distributed_port | int, default: 29500 | torch distributed backend port | None |
dump_state | bool, default: False | Print out state information of DeepSpeed object after initialization | None |
flops_profiler | Optional[DeepspeedFlopsConfig], default: None | Enables and configures the flops profiler. This would also enable wall_clock_breakdown | None |
fp16 | Optional[DeepspeedFP16Config], default: None | Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package | None |
fp32_allreduce | bool, default: False | During gradient averaging perform allreduce with 32 bit values | None |
gradient_predivide_factor | float, default: 1.0 | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs | None |
init_method | str, default: 'env://' | URL specifying how to initialize the process group | None |
prescale_gradients | bool, default: False | Scale gradients before doing allreduce | None |
progressive_layer_drop | Optional[DeepspeedPLDConfig], default: None | Enables and configures progressive layer dropping | None |
sparse_gradients | bool, default: False | Enable sparse compression of torch.nn.Embedding gradients | None |
steps_per_print | int, default: 10 | Print train loss every N steps | None |
tensorboard | Optional[DeepspeedTensorboardConfig], default: None | Enables and configures tensorboard support | None |
verbose | bool, default: True | flag to make deepspeed engine verbose with information | None |
wall_clock_breakdown | bool, default: False | Enable timing of the latency of forward/backward/update training phases | None |
zero_optimization | Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig() | Enables and configures ZeRO memory optimizations | None |
??? example "View Source" class DeepspeedConfig:
"""Deepspeed configuration class
Composed of other configuration classes related to specific functionality
Attributes
----------
activation_checkpointing: Optional[DeepspeedActivationCheckpointingConfig], default: DeepspeedActivationCheckpointingConfig()
Enables and configures activation checkpointing
aio: Optional[DeepspeedAIOConfig], default: DeepspeedAIOConfig()
Configuring the asynchronous I/O module for offloading parameter and optimizer states to persistent
(NVMe) storage
auto_mpi_discovery: bool, default: True
if distributed environment variables are not set, attempt to discover them from MPI
disable_allgather: bool, default: False
Disables allgather
dist_backend: BackendOptions, default: 'nccl'
Which communication backend to use
distributed_port: int, default: 29500
torch distributed backend port
dump_state: bool, default: False
Print out state information of DeepSpeed object after initialization
flops_profiler: Optional[DeepspeedFlopsConfig], default: None
Enables and configures the flops profiler. This would also enable wall_clock_breakdown
fp16: Optional[DeepspeedFP16Config], default: None
Enables and configures mixed precision/FP16 training that leverages NVIDIA’s Apex package
fp32_allreduce: bool, default: False
During gradient averaging perform allreduce with 32 bit values
gradient_predivide_factor: float, default: 1.0
Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability
when scaling to large numbers of GPUs
init_method: str, default: 'env://'
URL specifying how to initialize the process group
prescale_gradients: bool, default: False
Scale gradients before doing allreduce
progressive_layer_drop: Optional[DeepspeedPLDConfig], default: None
Enables and configures progressive layer dropping
sparse_gradients: bool, default: False
Enable sparse compression of torch.nn.Embedding gradients
steps_per_print: int, default: 10
Print train loss every N steps
tensorboard: Optional[DeepspeedTensorboardConfig], default: None
Enables and configures tensorboard support
verbose: bool, default: True
flag to make deepspeed engine verbose with information
wall_clock_breakdown: bool, default: False
Enable timing of the latency of forward/backward/update training phases
zero_optimization: Optional[DeepspeedZeROConfig], default: DeepspeedZeROConfig()
Enables and configures ZeRO memory optimizations
Notes
-----
Deepspeed does not use Apex’s AMP mode, which allows for more flexibility in mixed precision training modes. FP16
here is similar to AMP’s O2 mode
"""
activation_checkpointing: Optional[
DeepspeedActivationCheckpointingConfig
] = DeepspeedActivationCheckpointingConfig()
aio: Optional[DeepspeedAIOConfig] = DeepspeedAIOConfig()
auto_mpi_discovery: bool = True
disable_allgather: bool = False
dist_backend: BackendOptions = "nccl"
distributed_port: int = 29500
dump_state: bool = False
flops_profiler: Optional[DeepspeedFlopsConfig] = None
fp16: Optional[DeepspeedFP16Config] = None
fp32_allreduce: bool = False
gradient_predivide_factor: float = 1.0
init_method: str = "env://"
prescale_gradients: bool = False
progressive_layer_drop: Optional[DeepspeedPLDConfig] = None
sparse_gradients: bool = False
steps_per_print: int = 10
tensorboard: Optional[DeepspeedTensorboardConfig] = None
verbose: bool = True
wall_clock_breakdown: bool = False
zero_optimization: Optional[DeepspeedZeROConfig] = DeepspeedZeROConfig()
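Since DeepspeedConfig is composed of the other config classes, enabling a feature means passing the corresponding sub-config. A sketch of ZeRO stage 2 with dynamic FP16 loss scaling (import path assumed as above):

```python
from stoke import DeepspeedConfig, DeepspeedFP16Config, DeepspeedZeROConfig

ds_config = DeepspeedConfig(
    # dynamic loss scaling starting at 2 ** 16
    fp16=DeepspeedFP16Config(initial_scale_power=16),
    # stage 2: partition optimizer state and gradients
    zero_optimization=DeepspeedZeROConfig(
        stage=2,
        contiguous_gradients=True,  # helps with very large models
    ),
)
```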
DeepspeedFP16Config
class DeepspeedFP16Config(
hysteresis: int = 2,
initial_scale_power: int = 32,
loss_scale: float = 0.0,
loss_scale_window: int = 1000,
min_loss_scale: int = 1000
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
hysteresis | int, default: 2 | represents the delay shift in dynamic loss scaling | None |
initial_scale_power | int, default: 32 | power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power | None |
loss_scale | float, default: 0.0 | loss scaling value for FP16 training (0.0 --> dynamic scaling) | None |
loss_scale_window | int, default: 1000 | the window over which to raise/lower the dynamic loss scale value | None |
min_loss_scale | int, default: 1000 | minimum dynamic loss scale value | None |
??? example "View Source" class DeepspeedFP16Config:
"""Deepspeed FP16 configuration class
Attributes
----------
hysteresis: int, default: 2
represents the delay shift in dynamic loss scaling
initial_scale_power: int, default: 32
power of the initial dynamic loss scale value. The actual loss scale is computed as 2 ** initial_scale_power
loss_scale: float, default: 0.0
loss scaling value for FP16 training (0.0 --> dynamic scaling)
loss_scale_window: int, default: 1000
the window over which to raise/lower the dynamic loss scale value
min_loss_scale: int, default: 1000
minimum dynamic loss scale value
"""
hysteresis: int = 2
initial_scale_power: int = 32
loss_scale: float = 0.0
loss_scale_window: int = 1000
min_loss_scale: int = 1000
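Note that initial_scale_power is an exponent, not the scale itself; a quick sketch of the relationship:

```python
from stoke import DeepspeedFP16Config

fp16_config = DeepspeedFP16Config(initial_scale_power=16)

# the effective initial loss scale is 2 ** initial_scale_power
assert 2.0 ** fp16_config.initial_scale_power == 65536.0
```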
DeepspeedFlopsConfig
class DeepspeedFlopsConfig(
detailed: bool = True,
module_depth: int = -1,
output_file: Union[str, NoneType] = None,
profile_step: int = 1,
top_modules: int = 1
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
detailed | bool, default: True | Whether to print the detailed model profile | None |
module_depth | int, default: -1 | The depth of the model at which to print the aggregated module information. When set to -1, it prints information from the top module to the innermost modules (the maximum depth). | None |
output_file | Optional[str], default: None | Path to the output file. If None, the profiler prints to stdout | None |
profile_step | int, default: 1 | The global training step at which to profile. | None |
top_modules | int, default: 1 | Limits the aggregated profile output to the number of top modules specified. | None |
??? example "View Source" class DeepspeedFlopsConfig:
"""Deepspeed flops profiler configuration class
Attributes
----------
detailed: bool, default: True
Whether to print the detailed model profile
module_depth: int, default: -1
The depth of the model at which to print the aggregated module information. When set to -1, it prints
information from the top module to the innermost modules (the maximum depth).
output_file: Optional[str], default: None
Path to the output file. If None, the profiler prints to stdout
profile_step: int, default: 1
The global training step at which to profile.
top_modules: int, default: 1
Limits the aggregated profile output to the number of top modules specified.
Notes
-----
Warm up steps are needed for accurate time measurement
"""
detailed: bool = True
module_depth: int = -1
output_file: Optional[str] = None
profile_step: int = 1
top_modules: int = 1
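A sketch that profiles a post-warm-up step and writes the report to a (hypothetical) file path instead of stdout:

```python
from stoke import DeepspeedFlopsConfig

flops_config = DeepspeedFlopsConfig(
    profile_step=10,  # profile after some warm-up steps
    top_modules=3,    # keep only the top 3 modules in the summary
    detailed=False,
    output_file="flops_profile.txt",  # hypothetical output path
)
```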
DeepspeedOffloadOptimizerConfig
class DeepspeedOffloadOptimizerConfig(
buffer_count: int = 4,
device: stoke.configs.OffloadDevice = 'cpu',
fast_init: bool = False,
nvme_path: str = '/local_nvme',
pin_memory: bool = False,
pipeline: bool = False,
pipeline_read: bool = False,
pipeline_write: bool = False
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
buffer_count | int, default: 4 | Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). | None |
device | OffloadDevice, default: 'cpu' | Device memory to offload optimizer state | None |
fast_init | bool, default: False | Enable fast optimizer initialization when offloading to NVMe | None |
nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for optimizer state offloading | None |
pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |
pipeline | bool, default: False | pipeline activated (will default to True if either pipeline_read or pipeline_write is set | to |
pipeline_read | bool, default: False | activate pipeline read (deepspeed has limited docs for what this does) | None |
pipeline_write | bool, default: False | activate pipeline write(deepspeed has limited docs for what this does) | None |
??? example "View Source" class DeepspeedOffloadOptimizerConfig:
"""Deepspeed optimizer offloading configuration class
Attributes
----------
buffer_count: int, default: 4
Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number
of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter,
gradient, momentum, and variance).
device: OffloadDevice, default: 'cpu'
Device memory to offload optimizer state
fast_init: bool, default: False
Enable fast optimizer initialization when offloading to NVMe
nvme_path: str, default: '/local_nvme'
Filesystem path for NVMe device for optimizer state offloading
pin_memory: bool, default: False
Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.
pipeline: bool, default: False
pipeline activated (will default to True if either pipeline_read or pipeline_write is set)
pipeline_read: bool, default: False
activate pipeline read (deepspeed has limited docs for what this does)
pipeline_write: bool, default: False
activate pipeline write (deepspeed has limited docs for what this does)
"""
buffer_count: int = 4
device: OffloadDevice = "cpu"
fast_init: bool = False
nvme_path: str = "/local_nvme"
pin_memory: bool = False
pipeline: bool = False
pipeline_read: bool = False
pipeline_write: bool = False
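A sketch of NVMe offloading built from these attributes (the mount path is illustrative and must point at a real NVMe filesystem):

```python
from stoke.configs import DeepspeedOffloadOptimizerConfig

# Adam maintains 4 states per parameter (parameter, gradient, momentum,
# variance), so buffer_count should be at least 4 when offloading to NVMe
offload_opt_config = DeepspeedOffloadOptimizerConfig(
    device="nvme",
    nvme_path="/local_nvme",  # illustrative mount point
    buffer_count=4,
    fast_init=True,
)
```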
DeepspeedOffloadParamConfig
class DeepspeedOffloadParamConfig(
buffer_count: int = 5,
buffer_size: int = 100000000,
device: stoke.configs.OffloadDevice = 'cpu',
max_in_cpu: int = 1000000000,
nvme_path: str = '/local_nvme',
pin_memory: bool = False
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
buffer_count | int, default: 5 | Number of buffers in buffer pool for parameter offloading to NVMe | None |
buffer_size | int, default: int(1E8) | Size of buffers in buffer pool for parameter offloading to NVMe | None |
device | OffloadDevice, default: 'cpu' | Device memory to offload model parameters | None |
max_in_cpu | int, default: int(1E9) | Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | None |
nvme_path | str, default: '/local_nvme' | Filesystem path for NVMe device for parameter offloading | None |
pin_memory | bool, default: False | Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | None |
??? example "View Source" class DeepspeedOffloadParamConfig:
"""Deepspeed parameter offloading configuration class
Attributes
----------
buffer_count: int, default: 5
Number of buffers in buffer pool for parameter offloading to NVMe
buffer_size: int, default: int(1E8)
Size of buffers in buffer pool for parameter offloading to NVMe
device: OffloadDevice, default: 'cpu'
Device memory to offload model parameters
max_in_cpu: int, default: int(1E9)
Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.
nvme_path: str, default: '/local_nvme'
Filesystem path for NVMe device for parameter offloading
pin_memory: bool, default: False
Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.
"""
buffer_count: int = 5
buffer_size: int = int(1e8)
device: OffloadDevice = "cpu"
max_in_cpu: int = int(1e9)
nvme_path: str = "/local_nvme"
pin_memory: bool = False
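And a corresponding sketch for CPU parameter offloading with pinned host memory:

```python
from stoke.configs import DeepspeedOffloadParamConfig

# Offload parameters to page-locked CPU memory: faster host-to-device
# transfers at the cost of extra (non-pageable) host memory
offload_param_config = DeepspeedOffloadParamConfig(
    device="cpu",
    pin_memory=True,
)
```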
DeepspeedPLDConfig
class DeepspeedPLDConfig(
theta: float = 1.0,
gamma: float = 0.001
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
theta | float, default: 1.0 | Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value, the faster the training speed | None |
gamma | float, default: 0.001 | Hyper-parameter that controls how fast the drop ratio increases | None |
??? example "View Source" class DeepspeedPLDConfig:
"""
Attributes
----------
theta: float, default: 1.0
Hyper-parameter that controls the trade-off between training time and robustness. The lower the theta value,
the faster the training speed
gamma: float, default: 0.001
Hyper-parameter that controls how fast the drop ratio increases
"""
theta: float = 1.0
gamma: float = 0.001
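A short sketch of a schedule more aggressive than the defaults (values are illustrative):

```python
from stoke.configs import DeepspeedPLDConfig

# Lower theta trades robustness for speed; a larger gamma makes the
# layer-drop ratio ramp up faster over training
pld_config = DeepspeedPLDConfig(theta=0.5, gamma=0.01)
```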
DeepspeedTensorboardConfig
class DeepspeedTensorboardConfig(
output_path: str = '',
job_name: str = 'DeepSpeedJobName'
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
output_path | str, default: '' | Tensorboard output path | None |
job_name | str, default: 'DeepSpeedJobName' | Tensorboard job name | None |
??? example "View Source" class DeepspeedTensorboardConfig:
"""Deepspeed Tensorboard configuration class
Attributes
----------
output_path: str, default: ''
Tensorboard output path
job_name: str, default: 'DeepSpeedJobName'
Tensorboard job name
"""
output_path: str = ""
job_name: str = "DeepSpeedJobName"
DeepspeedZeROConfig
class DeepspeedZeROConfig(
allgather_bucket_size: int = 500000000,
allgather_partitions: bool = True,
contiguous_gradients: bool = False,
ignore_unused_parameters: bool = True,
legacy_stage1: bool = False,
offload_optimizer: Union[stoke.configs.DeepspeedOffloadOptimizerConfig, NoneType] = None,
offload_param: Union[stoke.configs.DeepspeedOffloadParamConfig, NoneType] = None,
overlap_comm: bool = False,
reduce_bucket_size: int = 500000000,
reduce_scatter: bool = True,
stage: int = 0,
stage3_max_live_parameters: int = 1000000000,
stage3_max_reuse_distance: int = 1000000000,
stage3_prefetch_bucket_size: int = 500000000,
stage3_param_persistence_threshold: int = 1000000,
stage3_gather_fp16_weights_on_model_save: bool = False,
sub_group_size: int = 1000000000000
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
allgather_bucket_size | int, default: int(5E8) | Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes | None |
allgather_partitions | bool, default: True | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | None |
contiguous_gradients | bool, default: False | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. | None |
ignore_unused_parameters | bool, default: True | Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload. Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707 | None |
legacy_stage1 | bool, default: False | Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons | None |
offload_optimizer | Optional[DeepspeedOffloadOptimizerConfig], default: None | Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3 | None |
offload_param | Optional[DeepspeedOffloadParamConfig], default: None | Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | None |
overlap_comm | bool, default: False | Attempts to overlap the reduction of the gradients with backward computation | None |
reduce_bucket_size | int, default: int(5E8) | Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | None |
reduce_scatter | bool, default: True | Uses reduce or reduce scatter instead of allreduce to average gradients | None |
stage | int, default: 0 | Chooses different stages of ZeRO Optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively | None |
stage3_max_live_parameters | int, default: int(1E9) | The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. | None |
stage3_max_reuse_distance | int, default: int(1E9) | Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. | None |
stage3_prefetch_bucket_size | int, default: int(5E8) | The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. | None |
stage3_param_persistence_threshold | int, default: int(1E6) | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | None |
stage3_gather_fp16_weights_on_model_save | bool, default: False | Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | None |
sub_group_size | int, default: int(1E12) | Controls the granularity in which parameters are updated during optimizer steps. Parameters are grouped into buckets of sub_group_size and each bucket is updated one at a time. | None |
??? example "View Source" class DeepspeedZeROConfig:
"""Deepspeed ZeRO configuration class
Attributes
----------
allgather_bucket_size: int, default: int(5E8)
Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes
allgather_partitions: bool, default: True
Chooses between allgather collective or a series of broadcast collectives to gather updated parameters
from all the GPUs at the end of each step
contiguous_gradients: bool, default: False
Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward
pass. Only useful when running very large models.
ignore_unused_parameters: bool, default: True
Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload
Enable this option to avoid -- https://github.com/microsoft/DeepSpeed/issues/707
legacy_stage1: bool, default: False
Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons
offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig], default: None
Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU
memory for larger models or batch sizes. Valid only with stage 3
offload_param: Optional[DeepspeedOffloadParamConfig], default: None
Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch
sizes. Valid only with stage 3.
overlap_comm: bool, default: False
Attempts to overlap the reduction of the gradients with backward computation
reduce_bucket_size: int, default: int(5E8)
Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large
model sizes
reduce_scatter: bool, default: True
Uses reduce or reduce scatter instead of allreduce to average gradients
stage: int, default: 0
Chooses different stages of ZeRO Optimizer. Stages 0, 1, 2, and 3 refer to disabled, optimizer state
partitioning, optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning,
respectively
stage3_max_live_parameters: int, default: int(1E9)
The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but
perform more communication.
stage3_max_reuse_distance: int, default: int(1E9)
Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less
memory, but perform more communication.
stage3_prefetch_bucket_size: int, default: int(5E8)
The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase
stalls due to communication.
stage3_param_persistence_threshold: int, default: int(1E6)
Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly
increase communication (especially latency-bound messages).
stage3_gather_fp16_weights_on_model_save: bool, default: False
Consolidate the weights before saving the model by save_fp16_model(). Since the weights are partitioned
across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this
option is enabled and then saves the fp16 model weights.
sub_group_size: int, default: int(1E12)
sub_group_size controls the granularity in which parameters are updated during optimizer steps. Parameters are
grouped into buckets of sub_group_size and each bucket is updated one at a time.
"""
allgather_bucket_size: int = int(5e8)
allgather_partitions: bool = True
contiguous_gradients: bool = False
ignore_unused_parameters: bool = True
legacy_stage1: bool = False
offload_optimizer: Optional[DeepspeedOffloadOptimizerConfig] = None
offload_param: Optional[DeepspeedOffloadParamConfig] = None
overlap_comm: bool = False
reduce_bucket_size: int = int(5e8)
reduce_scatter: bool = True
stage: int = 0
stage3_max_live_parameters: int = int(1e9)
stage3_max_reuse_distance: int = int(1e9)
stage3_prefetch_bucket_size: int = int(5e8)
stage3_param_persistence_threshold: int = int(1e6)
stage3_gather_fp16_weights_on_model_save: bool = False
sub_group_size: int = int(1e12)
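Tying the pieces together: the offload configs are only honored with stage 3, so a full CPU-offload setup looks roughly like the sketch below.

```python
from stoke.configs import (
    DeepspeedOffloadOptimizerConfig,
    DeepspeedOffloadParamConfig,
    DeepspeedZeROConfig,
)

# ZeRO stage 3 (optimizer + gradient + parameter partitioning) with both
# optimizer state and parameters offloaded to pinned CPU memory; the
# offload_* fields are ignored below stage 3
zero_config = DeepspeedZeROConfig(
    stage=3,
    contiguous_gradients=True,  # avoid fragmentation on very large models
    overlap_comm=True,          # overlap gradient reduction with backward
    offload_optimizer=DeepspeedOffloadOptimizerConfig(device="cpu", pin_memory=True),
    offload_param=DeepspeedOffloadParamConfig(device="cpu", pin_memory=True),
)
```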
FairscaleFSDPConfig
class FairscaleFSDPConfig(
bucket_cap_mb: int = 25,
buffer_dtype: Union[torch.dtype, NoneType] = None,
clear_autocast_cache: bool = False,
compute_dtype: Union[torch.dtype, NoneType] = None,
flatten_parameters: bool = True,
force_input_to_fp32: bool = False,
fp32_reduce_scatter: bool = False,
gradient_predivide_factor: Union[float, NoneType] = None,
gradient_postdivide_factor: Union[float, NoneType] = None,
move_grads_to_cpu: Union[bool, NoneType] = None,
move_params_to_cpu: bool = False,
no_broadcast_optim_state: Union[bool, NoneType] = False,
reshard_after_forward: bool = True,
verbose: bool = False
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
bucket_cap_mb | int, default: 25 | FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters. bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple bucket_cap_mb sized buffers) shared by all FSDP instances. Large gradient tensors are directly reduced without using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during the backward pass and freed at the end of the backward pass to save more memory for other phases of the training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the backward. In FSDP, the buffer size does not change with model size (it changes based on number of <dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with compute is done differently too. Values <= 0 disable bucketing | None |
buffer_dtype | Optional[torch.dtype], default: None | dtype for buffers for computation. Defaults to the value of compute_dtype | None |
clear_autocast_cache | bool, default: False | When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this flag to true will help clearing this cache as inner FSDP instances finish part of the forward pass to save GPU memory | None |
compute_dtype | Optional[torch.dtype], default: None | dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set, in which case it defaults to torch.float16. | None |
flatten_parameters | bool, default: True | flatten parameters into a single contiguous tensor, which improves training speed | None |
force_input_to_fp32 | bool, default: False | force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper. | None |
fp32_reduce_scatter | bool, default: False | reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used | None |
gradient_predivide_factor | Optional[float], default: None | divide factor before the reduction | None |
gradient_postdivide_factor | Optional[float], default: None | divide factor after the reduction | None |
move_grads_to_cpu | Optional[bool], default: None | move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used | None |
move_params_to_cpu | bool, default: False | offload FP32 params to CPU. This is only relevant when FP16 AMP is used | None |
no_broadcast_optim_state | Optional[bool], default: False | do not broadcast this module's optimizer state when gather_full_optim_state_dict is called. If you set this true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few parameters can fit on one node | None |
reshard_after_forward | bool, default: True | reshard parameters after the forward pass. This saves memory but slows training. This is only relevant when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html) | None |
verbose | bool, default: False | turn on verbose output for model’s string representation | None |
??? example "View Source" class FairscaleFSDPConfig:
"""Fairscale Fully Sharded Data Parallel configuration class
Attributes
----------
bucket_cap_mb: int, default: 25
FSDP will bucket parameters so that gradient reduction can be more efficient for small parameters.
bucket_cap_mb controls the bucket size in MegaBytes (MB). Buckets are sub-divided based on world_size, so the
max shard size is roughly bucket_cap_mb / world_size. There is one bucketer (with potentially multiple
bucket_cap_mb sized buffers) shared by all FSDP instances. Large gradient tensors are directly reduced without
using the buffers. The buffers are there to reduce communication overhead for small tensors. Overlapping with
computation happens due to use of a different CUDA stream than the computation CUDA stream. The total memory
overhead per buffer is around bucket_cap_mb / world_size * (world_size + 1). The buffers are allocated during
the backward pass and freed at the end of the backward pass to save more memory for other phases of the
training process. Note, the memory vs. speed tradeoff of bucket size is very different from that of the DDP
engine. In DDP, the buffer size is 1MB + n*cap_mb, until n is big enough to cover the entire model size. The
order of which buffer is ready there is more rigid and DDP requires all gradients to be computed in the
backward. In FSDP, the buffer size does not change with model size (it changes based on number of
<dtype, device, process_group> tuples) and gradient ready order matters little since FSDP has a final flush
call that ensures everything is reduced and not all gradients need to be upfront known. Overlapping with
compute is done differently too. Values <= 0 disable bucketing
buffer_dtype: Optional[torch.dtype], default: None
dtype for buffers for computation. defaults to value of compute_dtype
clear_autocast_cache: bool, default: False
When using mixed precision training with FP16 AMP, if the model weights are in FP32, autocast
maintains a cache for downcasted weights. The cache can cause GPU OOM during the forward pass. Setting this
flag to true will help clearing this cache as inner FSDP instances finish part of the forward pass to save
GPU memory
compute_dtype: Optional[torch.dtype], default: None
dtype for full parameters for computation. This defaults to torch.float32 unless FP16 AMP is set,
in which case it defaults to torch.float16.
flatten_parameters: bool, default: True
flatten parameters into a single contiguous tensor, which improves training speed
force_input_to_fp32: bool, default: False
force input floating point tensors to be FP32 (if they are FP16) when the FSDP instance is in full precision
mode. This helps avoid issues of running SyncBatchNorm with AMP and checkpoint_wrapper.
fp32_reduce_scatter: bool, default: False
reduce-scatter gradients in FP32. This is only relevant when FP16 AMP is used
gradient_predivide_factor: Optional[float], default: None
divide factor before the reduction
gradient_postdivide_factor: Optional[float], default: None
divide factor after the reduction
move_grads_to_cpu: Optional[bool], default: None
move gradient shard to CPU after reduction. This is only relevant when FP16 AMP is used
move_params_to_cpu: bool, default: False
offload FP32 params to CPU. This is only relevant when FP16 AMP is used
no_broadcast_optim_state: Optional[bool], default: False
do not broadcast this module's optimizer state when gather_full_optim_state_dict is called. If you set this
true, you are expected to overwrite the relevant state entries of the returned optimizer state dict with the
proper state at each rank. This is useful for situations, like Mixture Of Experts, where all but a few
parameters can fit on one node
reshard_after_forward: bool, default: True
reshard parameters after the forward pass. This saves memory but slows training. This is only relevant
when resharding individual layers (see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html)
verbose: bool, default: False
turn on verbose output for model’s string representation
Notes
-----
mixed_precision: bool
This value will automatically be set from the Stoke FP16 selected option (AMP only)
state_dict_device: torch.device
this is not exposed as it should be managed internally from the DDP backend setup
compute_device: torch.device
this is not exposed as it should be managed internally from the DDP backend setup
"""
bucket_cap_mb: int = 25
buffer_dtype: Optional[torch.dtype] = None
clear_autocast_cache: bool = False
compute_dtype: Optional[torch.dtype] = None
flatten_parameters: bool = True
force_input_to_fp32: bool = False
fp32_reduce_scatter: bool = False
gradient_predivide_factor: Optional[float] = None
gradient_postdivide_factor: Optional[float] = None
move_grads_to_cpu: Optional[bool] = None
move_params_to_cpu: bool = False
no_broadcast_optim_state: Optional[bool] = False
reshard_after_forward: bool = True
verbose: bool = False
Descendants
- stoke.extensions._FairscaleFSDPConfig
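A sketch that trades training speed for memory using the attributes above (the CPU-offload flags only take effect under FP16 AMP):

```python
from stoke.configs import FairscaleFSDPConfig

# Memory-lean settings: reshard after every forward pass and move both
# gradient shards and FP32 params to CPU (FP16 AMP only)
fsdp_config = FairscaleFSDPConfig(
    reshard_after_forward=True,
    move_grads_to_cpu=True,
    move_params_to_cpu=True,
    bucket_cap_mb=25,  # <= 0 would disable gradient bucketing
)
```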
FairscaleOSSConfig
class FairscaleOSSConfig(
broadcast_fp16: bool = False
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
broadcast_fp16 | bool, default: False | Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy. | None |
??? example "View Source" class FairscaleOSSConfig:
"""Fairscale optimizer state sharding configuration class
Attributes
----------
broadcast_fp16: bool, default: False
Compress the model shards in fp16 before sharing them in between ranks. This is safe to use when PyTorch AMP
is activated. Without torch AMP this will lead to a slight degradation in terms of accuracy.
"""
broadcast_fp16: bool = False
FairscaleSDDPConfig
class FairscaleSDDPConfig(
auto_refresh_trainable: bool = True,
broadcast_buffers: bool = True,
reduce_buffer_size: int = 8388608,
reduce_fp16: bool = False,
sync_models_at_startup: bool = True
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
auto_refresh_trainable | bool, default: True | Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a parameter is frozen or unfrozen | None |
broadcast_buffers | bool, default: True | Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters. | None |
reduce_buffer_size | int, default: 2 ** 23 | The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact the long term memory consumption, because these buckets correspond to parameters which will not be sharded. Set to 0 to remove all bucketing, 1M to 8M is usually reasonable. | None |
reduce_fp16 | bool, default: False | cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and will also save some memory. | None |
sync_models_at_startup | bool, default: True | Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or the training restarts from a saved state | None |
??? example "View Source" class FairscaleSDDPConfig:
"""Fairscale sharded data parallel (SDDP) configuration class
Attributes
----------
auto_refresh_trainable: bool, default: True
Check whether the parameters trainability (requires_grad) has changed and update both ShardedDDP and OSS
automatically if this is the case. If set to False, refresh_trainable() needs to be called anytime a
parameter is frozen or unfrozen
broadcast_buffers: bool, default: True
Whether to additionally broadcast model buffers in between ranks at the beginning of each forward pass. Same
setting as in Pytorch DDP, this is in addition to the broadcast and reduction of the model parameters.
reduce_buffer_size: int, default: 2 ** 23
The max size of the buffer used to batch the small parameter tensors, in number of elements. This will impact
the long term memory consumption, because these buckets correspond to parameters which will not be sharded.
Set to 0 to remove all bucketing, 1M to 8M is usually reasonable.
reduce_fp16: bool, default: False
cast the grads to fp16 before reducing. Not needed if the model is already fp16, but will probably improve
performance for multi node jobs using PyTorch AMP. The effect is similar to DDP’s fp16_compress_hook and
will also save some memory.
sync_models_at_startup: bool, default: True
Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed, or
the training restarts from a saved state
"""
auto_refresh_trainable: bool = True
broadcast_buffers: bool = True
reduce_buffer_size: int = 2 ** 23
reduce_fp16: bool = False
sync_models_at_startup: bool = True
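Since OSS shards the optimizer state and SDDP provides the matching data-parallel wrapper, the two configs are typically used together; a sketch:

```python
from stoke.configs import FairscaleOSSConfig, FairscaleSDDPConfig

# Shard optimizer state across ranks (fp16 broadcast is safe under
# PyTorch AMP) and pair it with sharded DDP
oss_config = FairscaleOSSConfig(broadcast_fp16=True)
sddp_config = FairscaleSDDPConfig(
    reduce_buffer_size=2 ** 23,  # ~8M elements; 0 disables bucketing
    reduce_fp16=True,            # similar effect to DDP's fp16_compress_hook
)
```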
HorovodConfig
class HorovodConfig(
compression: bool = False,
convert_to_sync_batch_norm: bool = False,
gradient_predivide_factor: float = 1.0,
op: stoke.configs.HorovodOps = 'Average'
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
compression | bool, default: False | Compression algorithm used during allreduce to reduce the amount of data sent during each parameter update step. | None |
convert_to_sync_batch_norm | bool, default: False | Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm | None |
gradient_predivide_factor | float, default: 1.0 | If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum. | None |
op | HorovodOps, default: 'Average' | The reduction operation to use when combining gradients across different ranks. | None |
??? example "View Source" class HorovodConfig:
"""Horovod configuration class
Attributes
----------
compression: bool, default: False
Compression algorithm used during allreduce to reduce the amount of data sent during each parameter
update step.
convert_to_sync_batch_norm: bool, default: False
Automatically convert all batch norm calls to horovod.torch.SyncBatchNorm calls
https://horovod.readthedocs.io/en/stable/api.html#horovod.torch.SyncBatchNorm
gradient_predivide_factor: float, default: 1.0
If op == Average, gradient_predivide_factor splits the averaging before and after the sum. Gradients are scaled
by 1.0 / gradient_predivide_factor before the sum and gradient_predivide_factor / size after the sum.
op: HorovodOps, default: 'Average'
The reduction operation to use when combining gradients across different ranks.
"""
compression: bool = False
convert_to_sync_batch_norm: bool = False
gradient_predivide_factor: float = 1.0
op: HorovodOps = "Average"
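As a usage sketch, averaging with sync batch norm enabled and the division split around the allreduce sum:

```python
from stoke.configs import HorovodConfig

# With op == 'Average', a predivide factor of 2.0 scales gradients by
# 1/2 before the sum and by 2/size after it
horovod_config = HorovodConfig(
    convert_to_sync_batch_norm=True,
    gradient_predivide_factor=2.0,
    op="Average",  # mirrors the default
)
```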
HorovodOps
class HorovodOps(
/,
*args,
**kwargs
)
??? example "View Source" class HorovodOps(Enum):
"""Horovod ops options"""
Average = "Average"
Sum = "Sum"
Adasum = "Adasum"
Ancestors (in MRO)
- enum.Enum
Class variables
Adasum
Average
Sum
name
value
OffloadDevice
class OffloadDevice(
/,
*args,
**kwargs
)
??? example "View Source" class OffloadDevice(Enum):
"""Offload device options"""
none = "none"
cpu = "cpu"
nvme = "nvme"
Ancestors (in MRO)
- enum.Enum
Class variables
cpu
name
none
nvme
value
StokeOptimizer
class StokeOptimizer(
/,
*args,
**kwargs
)
Attributes
Name | Type | Description | Default |
---|---|---|---|
optimizer | Type[torch.optim.Optimizer] | un-instantiated torch.optim.Optimizer class | None |
optimizer_kwargs | Dict | any keyword args to be unrolled into the optimizer at instantiation time | None |
??? example "View Source" class StokeOptimizer(TypedDict):
"""Stoke optimizer wrapper class
Given all the different backends and extensions the optimizer might need to be instantiated in a different way
thus this typed dict holds the configuration without instantiation
Attributes
----------
optimizer: Type[torch.optim.Optimizer]
un-instantiated torch.optim.Optimizer class
optimizer_kwargs: Dict
any keyword args to be unrolled into the optimizer at instantiation time
"""
optimizer: Type[torch.optim.Optimizer]
optimizer_kwargs: Dict
Ancestors (in MRO)
- builtins.dict
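Because StokeOptimizer is a TypedDict, it is built like a plain dict; a minimal sketch with AdamW:

```python
import torch

from stoke.configs import StokeOptimizer

# Hold the un-instantiated optimizer class plus its kwargs; Stoke defers
# instantiation so each backend/extension can wrap it appropriately
opt = StokeOptimizer(
    optimizer=torch.optim.AdamW,
    optimizer_kwargs={"lr": 1e-3, "weight_decay": 1e-2},
)
```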
Methods
clear
def clear(
...
)
D.clear() -> None. Remove all items from D.
copy
def copy(
...
)
D.copy() -> a shallow copy of D
fromkeys
def fromkeys(
iterable,
value=None,
/
)
Create a new dictionary with keys from iterable and values set to value.
get
def get(
self,
key,
default=None,
/
)
Return the value for key if key is in the dictionary, else default.
items
def items(
...
)
D.items() -> a set-like object providing a view on D's items
keys
def keys(
...
)
D.keys() -> a set-like object providing a view on D's keys
pop
def pop(
...
)
D.pop(k[,d]) -> v, remove specified key and return the corresponding value.
If key is not found, d is returned if given, otherwise KeyError is raised
popitem
def popitem(
self,
/
)
Remove and return a (key, value) pair as a 2-tuple.
Pairs are returned in LIFO (last-in, first-out) order. Raises KeyError if the dict is empty.
setdefault
def setdefault(
self,
key,
default=None,
/
)
Insert key with a value of default if key is not in the dictionary.
Return the value for key if key is in the dictionary, else default.
update
def update(
...
)
D.update([E, ]**F) -> None. Update D from dict/iterable E and F.
If E is present and has a .keys() method, then does: for k in E: D[k] = E[k]
If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v
In either case, this is followed by: for k in F: D[k] = F[k]
values
def values(
...
)
D.values() -> an object providing a view on D's values