popt.update_schemes.optimizers

import numpy as np


class GradientAscent:
    r"""
    A class for performing gradient ascent optimization with momentum and backtracking.
    The update equation with momentum, written in descent form (the form
    implemented by ``apply_update``), is:

    .. math::
        \begin{align}
            v_t &= \beta \, v_{t-1} + \alpha \, g_t \\
            x_t &= x_{t-1} - v_t
        \end{align}

    where :math:`g_t` is the gradient, :math:`\alpha` the step size, and
    :math:`\beta` the momentum factor.

    Attributes
    -----------------------------------------------------------------------------------
    step_size : float
        The initial step size provided during initialization.

    momentum : float
        The initial momentum factor provided during initialization.

    velocity : array_like
        Current velocity of the optimization process.

    temp_velocity : array_like
        Temporary velocity of the most recent (uncommitted) step.

    _step_size : float
        Private attribute for temporarily modifying the step size.

    _momentum : float
        Private attribute for temporarily modifying the momentum.

    Methods
    -----------------------------------------------------------------------------------
    apply_update(control, gradient, **kwargs):
        Apply a gradient update to the control parameter.

    apply_backtracking():
        Apply backtracking by temporarily reducing the step size and momentum.

    restore_parameters():
        Restore the original step size and momentum values.
    """

    def __init__(self, step_size, momentum):
        r"""
        Parameters
        ----------
        step_size : float
            The step size (learning rate) for the gradient ascent.

        momentum : float
            The momentum factor to apply during updates.
        """

        self.step_size = step_size
        self.momentum  = momentum
        self.velocity  = 0

        self.temp_velocity = 0
        self._step_size    = step_size
        self._momentum     = momentum

    def apply_update(self, control, gradient, **kwargs):
        """
        Apply a gradient update to the control parameter.
        NOTE: This is a steepest-descent style step: the velocity moves against
        the gradient, so x_new = x_old + v with v = β*v_old - α*g.

        Parameters
        -------------------------------------------------------------------------------------
        control : array_like
            The current value of the parameter being optimized.

        gradient : array_like
            The gradient of the objective function with respect to the control parameter.

        **kwargs : dict
            Additional keyword arguments.

        Returns
        -------------------------------------------------------------------------------------
        new_control, temp_velocity : tuple
            The new value of the control parameter after the update, and the step
            (velocity) that was applied.
        """
        alpha = self._step_size
        beta  = self._momentum

        # apply update
        self.temp_velocity = beta*self.velocity - alpha*gradient
        new_control = control + self.temp_velocity
        return new_control, self.temp_velocity

    def apply_smc_update(self, control, gradient, **kwargs):
        """
        Apply an update as a convex combination of the current control and the
        gradient term: x_new = (1 - α)*x_old + α*g.

        Parameters
        -------------------------------------------------------------------------------------
        control : array_like
            The current value of the parameter being optimized.

        gradient : array_like
            The gradient of the objective function with respect to the control parameter.

        **kwargs : dict
            Additional keyword arguments.

        Returns
        -------------------------------------------------------------------------------------
        new_control : numpy.ndarray
            The new value of the control parameter after the update.
        """
        alpha = self._step_size

        # apply update
        new_control = (1-alpha) * control + alpha * gradient
        return new_control

    def apply_backtracking(self):
        """
        Apply backtracking by temporarily halving the step size and momentum.
        """
        self._step_size = 0.5*self._step_size
        self._momentum  = 0.5*self._momentum

    def restore_parameters(self):
        """
        Commit the temporary velocity and restore the original step size and
        momentum values.
        """
        self.velocity   = self.temp_velocity
        self._step_size = self.step_size
        self._momentum  = self.momentum

    def get_momentum_for_nesterov(self):
        """Return the momentum contribution β*v used for a Nesterov look-ahead step."""
        return self.momentum * self.velocity

    def get_step_size(self):
        """Return the current (possibly backtracked) step size."""
        return self._step_size

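
# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of the momentum update above, minimizing f(x) = x^T x.
# The function name and all numeric settings are arbitrary choices.
def _demo_gradient_ascent():
    opt = GradientAscent(step_size=0.1, momentum=0.9)
    x = np.array([5.0, -3.0])
    for _ in range(100):
        grad = 2.0 * x                    # gradient of f(x) = x^T x
        x, _step = opt.apply_update(x, grad)
        opt.restore_parameters()          # commit the velocity for the next step
    return x                              # close to the minimizer [0, 0]
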
class Adam:
    """
    A class implementing the Adam optimizer for gradient-based optimization.
    The Adam update equation for the control x using gradient g,
    iteration t, and a small constant ε is given by:

        m_t = β1 * m_{t-1} + (1 - β1) * g   \n
        v_t = β2 * v_{t-1} + (1 - β2) * g^2 \n
        m_t_hat = m_t / (1 - β1^t)          \n
        v_t_hat = v_t / (1 - β2^t)          \n
        x_{t+1} = x_t - α * m_t_hat / (sqrt(v_t_hat) + ε)

    Attributes
    -------------------------------------------------------------------------------------
    step_size : float
        The initial step size provided during initialization.

    beta1 : float
        The exponential decay rate for the first moment estimates.

    beta2 : float
        The exponential decay rate for the second moment estimates.

    vel1 : 1-D array_like
        First moment estimate.

    vel2 : 1-D array_like
        Second moment estimate.

    eps : float
        Small constant to prevent division by zero.

    _step_size : float
        Private attribute for temporarily modifying the step size.

    temp_vel1 : 1-D array_like
        Temporary first moment estimate.

    temp_vel2 : 1-D array_like
        Temporary second moment estimate.

    Methods
    -------------------------------------------------------------------------------------
    apply_update(control, gradient, **kwargs):
        Apply an Adam update to the control parameter.

    apply_backtracking():
        Apply backtracking by temporarily reducing the step size.

    restore_parameters():
        Restore the original step size.

    References
    -------------------------------------------------------------------------------------
    [1] Kingma, D. P., & Ba, J. (2014).
        Adam: A Method for Stochastic Optimization.
        arXiv preprint arXiv:1412.6980.
    """

    def __init__(self, step_size, beta1=0.9, beta2=0.999):
        """
        Parameters
        -------------------------------------------------------------------------------------
        step_size : float
            The step size (learning rate) for the optimization.

        beta1 : float, optional
            The exponential decay rate for the first moment estimates (default is 0.9).

        beta2 : float, optional
            The exponential decay rate for the second moment estimates (default is 0.999).
        """
        self.step_size = step_size
        self.beta1 = beta1
        self.beta2 = beta2
        self.vel1  = 0
        self.vel2  = 0
        self.eps   = 1e-7

        self._step_size = step_size
        self.temp_vel1 = 0
        self.temp_vel2 = 0

    def apply_update(self, control, gradient, **kwargs):
        """
        Apply an Adam update to the control parameter.
        NOTE: This is the steepest descent update: x_new = x_old - step.

        Parameters
        -------------------------------------------------------------------------------------
        control : array_like
            The current value of the parameter being optimized.

        gradient : array_like
            The gradient of the objective function with respect to the control parameter.

        **kwargs : dict
            Additional keyword arguments, including 'iter' for the current
            (1-based) iteration, used in the bias correction.

        Returns
        -------------------------------------------------------------------------------------
        new_control, step : tuple
            The new value of the control parameter after the update, and the step
            that was applied.
        """
        iter  = kwargs['iter']  # 1-based iteration index
        alpha = self._step_size
        beta1 = self.beta1
        beta2 = self.beta2

        self.temp_vel1 = beta1*self.vel1 + (1-beta1)*gradient
        self.temp_vel2 = beta2*self.vel2 + (1-beta2)*gradient**2
        vel1_hat = self.temp_vel1/(1-beta1**iter)
        vel2_hat = self.temp_vel2/(1-beta2**iter)

        step = alpha*vel1_hat/(np.sqrt(vel2_hat)+self.eps)
        new_control = control - step  # steepest descent
        return new_control, step

    def apply_backtracking(self):
        """
        Apply backtracking by temporarily halving the step size.
        """
        self._step_size = 0.5*self._step_size

    def restore_parameters(self):
        """
        Commit the moment estimates and restore the original step size.
        """
        self.vel1 = self.temp_vel1
        self.vel2 = self.temp_vel2
        self._step_size = self.step_size

    def get_step_size(self):
        """Return the current (possibly backtracked) step size."""
        return self._step_size

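
# --- Usage sketch (illustrative; not part of the original module) ---
# Adam on f(x) = x^T x. Note that `apply_update` expects the 1-based iteration
# counter via the `iter` keyword; it is required for the bias correction.
def _demo_adam():
    opt = Adam(step_size=0.1)
    x = np.array([5.0, -3.0])
    for t in range(1, 101):               # iteration count must start at 1
        grad = 2.0 * x
        x, _step = opt.apply_update(x, grad, iter=t)
        opt.restore_parameters()          # commit the moment estimates
    return x
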
class AdaMax(Adam):
    '''
    AdaMax optimizer: a variant of Adam in which the second moment estimate is
    replaced by an exponentially weighted infinity norm.

    References
    -------------------------------------------------------------------------------------
    [1] Kingma, D. P., & Ba, J. (2014).
        Adam: A Method for Stochastic Optimization.
        arXiv preprint arXiv:1412.6980.
    '''
    def __init__(self, step_size, beta1=0.9, beta2=0.999):
        super().__init__(step_size, beta1, beta2)

    def apply_update(self, control, gradient, **kwargs):
        """
        Apply an AdaMax update to the control parameter. Same interface as
        Adam.apply_update, but the second moment is the running maximum
        u_t = max(β2 * u_{t-1}, |g|), and only the first moment is
        bias-corrected.
        """
        iter  = kwargs['iter']  # 1-based iteration index
        alpha = self._step_size
        beta1 = self.beta1
        beta2 = self.beta2

        self.temp_vel1 = beta1*self.vel1 + (1-beta1)*gradient
        self.temp_vel2 = np.maximum(beta2*self.vel2, np.abs(gradient))

        step = alpha/(1-beta1**iter) * self.temp_vel1/self.temp_vel2
        new_control = control - step
        return new_control, step
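
# --- Usage sketch (illustrative; not part of the original module) ---
# The backtracking protocol shared by the optimizers above, as suggested by the
# method names: take a trial step; while the objective worsens, halve the step
# size with `apply_backtracking` and retry; once a step is accepted, call
# `restore_parameters` to commit the internal state and reset the step size.
# The objective and all settings here are arbitrary choices.
def _demo_backtracking():
    def f(x):                             # simple quadratic objective
        return np.sum(x**2)

    opt = AdaMax(step_size=0.5)
    x = np.array([5.0, -3.0])
    for t in range(1, 51):
        grad = 2.0 * x
        trial, _step = opt.apply_update(x, grad, iter=t)
        while f(trial) > f(x) and opt.get_step_size() > 1e-8:
            opt.apply_backtracking()      # halve the step size and retry
            trial, _step = opt.apply_update(x, grad, iter=t)
        x = trial
        opt.restore_parameters()          # commit state, reset the step size
    return x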