diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index a32394bd5..5f225c9ad 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -26,7 +26,7 @@ def __init__(
         Base AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -87,7 +87,7 @@ def __init__(
         8-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -159,7 +159,7 @@ def __init__(
         32-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -219,7 +219,7 @@ def __init__(
         Paged AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -241,8 +241,6 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.
         """
         super().__init__(
             "adam",
@@ -279,7 +277,7 @@ def __init__(
         Paged 8-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -303,8 +301,6 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.
         """
         # Validate unsupported parameters
         if amsgrad:
@@ -350,7 +346,7 @@ def __init__(
         Paged 32-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -372,8 +368,6 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.
         """
         super().__init__(
             "adam",
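
Usage note (not part of the patch): a minimal sketch of how the optimizers whose docstrings change above are constructed, assuming the public bitsandbytes API and a CUDA-capable device; the Linear model and hyperparameter values are illustrative only.

    import torch
    import bitsandbytes as bnb

    # Any iterable of torch.Tensor parameters is accepted, hence the
    # docstring change from `torch.tensor` to `torch.Tensor`.
    model = torch.nn.Linear(512, 512).cuda()

    # Paged 8-bit AdamW; block_wise=True quantizes optimizer state per block,
    # as documented in the hunks above. The patch drops `is_paged` from the
    # paged classes' docstrings, consistent with paging being implied by the
    # class rather than passed by the user.
    optimizer = bnb.optim.PagedAdamW8bit(model.parameters(), lr=1e-3, block_wise=True)

    loss = model(torch.randn(8, 512, device="cuda")).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()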