diff --git a/PEPit/examples/composite_convex_minimization/__init__.py b/PEPit/examples/composite_convex_minimization/__init__.py index c927eec5..d485c665 100644 --- a/PEPit/examples/composite_convex_minimization/__init__.py +++ b/PEPit/examples/composite_convex_minimization/__init__.py @@ -1,5 +1,6 @@ from .accelerated_douglas_rachford_splitting import wc_accelerated_douglas_rachford_splitting from .accelerated_proximal_gradient import wc_accelerated_proximal_gradient +from .accelerated_proximal_gradient_simplified import wc_accelerated_proximal_gradient_simplified from .bregman_proximal_point import wc_bregman_proximal_point from .douglas_rachford_splitting import wc_douglas_rachford_splitting from .douglas_rachford_splitting_contraction import wc_douglas_rachford_splitting_contraction @@ -13,6 +14,7 @@ __all__ = ['accelerated_douglas_rachford_splitting', 'wc_accelerated_douglas_rachford_splitting', 'accelerated_proximal_gradient', 'wc_accelerated_proximal_gradient', + 'accelerated_proximal_gradient_simplified', 'wc_accelerated_proximal_gradient_simplified', 'bregman_proximal_point', 'wc_bregman_proximal_point', 'douglas_rachford_splitting', 'wc_douglas_rachford_splitting', 'douglas_rachford_splitting_contraction', 'wc_douglas_rachford_splitting_contraction', diff --git a/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient.py b/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient.py index 08254f28..9c5b3bf6 100644 --- a/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient.py +++ b/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient.py @@ -1,3 +1,5 @@ +from math import sqrt + from PEPit import PEP from PEPit.functions import SmoothStronglyConvexFunction from PEPit.functions import ConvexFunction @@ -14,7 +16,7 @@ def wc_accelerated_proximal_gradient(mu, L, n, wrapper="cvxpy", solver=None, ver and where :math:`h` is closed convex and proper. This code computes a worst-case guarantee for the **accelerated proximal gradient** method, - also known as **fast proximal gradient (FPGM)** method. + also known as **fast proximal gradient (FPGM)** method or FISTA [1]. That is, it computes the smallest possible :math:`\\tau(n, L, \\mu)` such that the guarantee .. math :: F(x_n) - F(x_\\star) \\leqslant \\tau(n, L, \\mu) \\|x_0 - x_\\star\\|^2, @@ -26,31 +28,26 @@ def wc_accelerated_proximal_gradient(mu, L, n, wrapper="cvxpy", solver=None, ver :math:`\\tau(n, L, \\mu)` is computed as the worst-case value of :math:`F(x_n) - F(x_\\star)` when :math:`\\|x_0 - x_\\star\\|^2 \\leqslant 1`. - **Algorithm**: Accelerated proximal gradient is described as follows, for :math:`t \in \\{ 0, \\dots, n-1\\}`, + **Algorithm**: Initialize :math:`\\lambda_1=1`, :math:`y_1=x_0`. One iteration of FISTA is described by .. math:: - :nowrap: \\begin{eqnarray} - x_{t+1} & = & \\arg\\min_x \\left\\{h(x)+\\frac{L}{2}\|x-\\left(y_{t} - \\frac{1}{L} \\nabla f(y_t)\\right)\\|^2 \\right\\}, \\\\ - y_{t+1} & = & x_{t+1} + \\frac{i}{i+3} (x_{t+1} - x_{t}), + \\text{Set: }\\lambda_{t+1} & = & \\frac{1 + \\sqrt{4\\lambda_t^2 + 1}}{2}\\\\ + x_t & = & \\arg\\min_x \\left\\{h(x)+\\frac{L}{2}\|x-\\left(y_t - \\frac{1}{L} \\nabla f(y_t)\\right)\\|^2 \\right\\}\\\\ + y_{t+1} & = & x_t + \\frac{\\lambda_t-1}{\\lambda_{t+1}} (x_t-x_{t-1}). \\end{eqnarray} - where :math:`y_{0} = x_0`. - - **Theoretical guarantee**: A **tight** (empirical) worst-case guarantee for FPGM is obtained in - [1, method FPGM1 in Sec. 
4.2.1, Table 1 in sec 4.2.2], for :math:`\\mu=0`: - - .. math:: F(x_n) - F_\\star \\leqslant \\frac{2 L}{n^2+5n+2} \\|x_0 - x_\\star\\|^2, + **Theoretical guarantee**: The following worst-case guarantee can be found in e.g., [1, Theorem 4.4]: - which is attained on simple one-dimensional constrained linear optimization problems. + .. math:: f(x_n)-f_\\star \\leqslant \\frac{L}{2}\\frac{\\|x_0-x_\\star\\|^2}{\\lambda_n^2}. **References**: - - `[1] A. Taylor, J. Hendrickx, F. Glineur (2017). - Exact worst-case performance of first-order methods for composite convex optimization. - SIAM Journal on Optimization, 27(3):1283–1313. - `_ + + `[1] A. Beck, M. Teboulle (2009). + A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems. + SIAM journal on imaging sciences, 2009, vol. 2, no 1, p. 183-202. + `_ Args: @@ -84,19 +81,19 @@ def wc_accelerated_proximal_gradient(mu, L, n, wrapper="cvxpy", solver=None, ver (PEPit) Setting up the problem: additional constraints for 0 function(s) (PEPit) Compiling SDP (PEPit) Calling SDP solver - (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.05263158422835028 + (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.05167329605152958 (PEPit) Primal feasibility check: - The solver found a Gram matrix that is positive semi-definite up to an error of 5.991982341524508e-09 - All the primal scalar constraints are verified up to an error of 1.4780313955381486e-08 + The solver found a Gram matrix that is positive semi-definite up to an error of 6.64684463996332e-09 + All the primal scalar constraints are verified up to an error of 1.6451693951591295e-08 (PEPit) Dual feasibility check: The solver found a residual matrix that is positive semi-definite All the dual scalar values associated with inequality constraints are nonnegative - (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 7.783914601477293e-08 - (PEPit) Final upper bound (dual): 0.052631589673196755 and lower bound (primal example): 0.05263158422835028 - (PEPit) Duality gap: absolute: 5.444846476465592e-09 and relative: 1.034520726726044e-07 + (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 8.587603813802402e-08 + (PEPit) Final upper bound (dual): 0.051673302055698395 and lower bound (primal example): 0.05167329605152958 + (PEPit) Duality gap: absolute: 6.004168814910393e-09 and relative: 1.1619480996379491e-07 *** Example file: worst-case performance of the Accelerated Proximal Gradient Method in function values*** - PEPit guarantee: f(x_n)-f_* <= 0.0526316 ||x0 - xs||^2 - Theoretical guarantee: f(x_n)-f_* <= 0.0526316 ||x0 - xs||^2 + PEPit guarantee: f(x_n)-f_* <= 0.0516733 ||x0 - xs||^2 + Theoretical guarantee: f(x_n)-f_* <= 0.0661257 ||x0 - xs||^2 """ @@ -118,26 +115,28 @@ def wc_accelerated_proximal_gradient(mu, L, n, wrapper="cvxpy", solver=None, ver # Set the initial constraint that is the distance between x0 and x^* problem.set_initial_condition((x0 - xs) ** 2 <= 1) - # Compute n steps of the accelerated proximal gradient method starting from x0 + # Compute n steps of the accelerated proximal gradient method starting from x0 x_new = x0 y = x0 + lam = 1 for i in range(n): + lam_old = lam + lam = (1 + sqrt(4 * lam_old ** 2 + 1)) / 2 x_old = x_new x_new, _, hx_new = proximal_step(y - 1 / L * f.gradient(y), h, 1 / L) - y = x_new + i / (i + 3) * (x_new - x_old) + y = x_new + (lam_old - 1) / lam * (x_new - x_old) - # Set the performance metric to the function 
value accuracy + # Set the performance metric to the function value accuracy problem.set_performance_metric((f(x_new) + hx_new) - Fs) # Solve the PEP pepit_verbose = max(verbose, 0) pepit_tau = problem.solve(wrapper=wrapper, solver=solver, verbose=pepit_verbose) - # Compute theoretical guarantee (for comparison) - if mu == 0: - theoretical_tau = 2 * L / (n ** 2 + 5 * n + 2) # tight, see [2], Table 1 (column 1, line 1) - else: - theoretical_tau = 2 * L / (n ** 2 + 5 * n + 2) # not tight (bound for smooth convex functions) + # Theoretical guarantee (for comparison) + theoretical_tau = L / (2 * lam_old ** 2) + + if mu != 0: print('Warning: momentum is tuned for non-strongly convex functions.') # Print conclusion if required diff --git a/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient_simplified.py b/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient_simplified.py new file mode 100644 index 00000000..daee4801 --- /dev/null +++ b/PEPit/examples/composite_convex_minimization/accelerated_proximal_gradient_simplified.py @@ -0,0 +1,155 @@ +from PEPit import PEP +from PEPit.functions import SmoothStronglyConvexFunction +from PEPit.functions import ConvexFunction +from PEPit.primitive_steps import proximal_step + + +def wc_accelerated_proximal_gradient_simplified(mu, L, n, wrapper="cvxpy", solver=None, verbose=1): + """ + Consider the composite convex minimization problem + + .. math:: F_\\star \\triangleq \\min_x \\{F(x) \equiv f(x) + h(x)\\}, + + where :math:`f` is :math:`L`-smooth and :math:`\\mu`-strongly convex, + and where :math:`h` is closed convex and proper. + + This code computes a worst-case guarantee for the **accelerated proximal gradient** method, + also known as **fast proximal gradient (FPGM)** method. + That is, it computes the smallest possible :math:`\\tau(n, L, \\mu)` such that the guarantee + + .. math :: F(x_n) - F(x_\\star) \\leqslant \\tau(n, L, \\mu) \\|x_0 - x_\\star\\|^2, + + is valid, where :math:`x_n` is the output of the **accelerated proximal gradient** method, + and where :math:`x_\\star` is a minimizer of :math:`F`. + + In short, for given values of :math:`n`, :math:`L` and :math:`\\mu`, + :math:`\\tau(n, L, \\mu)` is computed as the worst-case value of + :math:`F(x_n) - F(x_\\star)` when :math:`\\|x_0 - x_\\star\\|^2 \\leqslant 1`. + + **Algorithm**: Accelerated proximal gradient is described as follows, for :math:`t \in \\{ 0, \\dots, n-1\\}`, + + .. math:: + :nowrap: + + \\begin{eqnarray} + x_{t+1} & = & \\arg\\min_x \\left\\{h(x)+\\frac{L}{2}\|x-\\left(y_{t} - \\frac{1}{L} \\nabla f(y_t)\\right)\\|^2 \\right\\}, \\\\ + y_{t+1} & = & x_{t+1} + \\frac{i}{i+3} (x_{t+1} - x_{t}), + \\end{eqnarray} + + where :math:`y_{0} = x_0`. + + **Theoretical guarantee**: A **tight** (empirical) worst-case guarantee for FPGM is obtained in + [1, method FPGM1 in Sec. 4.2.1, Table 1 in sec 4.2.2], for :math:`\\mu=0`: + + .. math:: F(x_n) - F_\\star \\leqslant \\frac{2 L}{n^2+5n+2} \\|x_0 - x_\\star\\|^2, + + which is attained on simple one-dimensional constrained linear optimization problems. + + **References**: + + `[1] A. Taylor, J. Hendrickx, F. Glineur (2017). + Exact worst-case performance of first-order methods for composite convex optimization. + SIAM Journal on Optimization, 27(3):1283–1313. + `_ + + + Args: + L (float): the smoothness parameter. + mu (float): the strong convexity parameter. + n (int): number of iterations. + wrapper (str): the name of the wrapper to be used. 
+ solver (str): the name of the solver the wrapper should use. + verbose (int): level of information details to print. + + - -1: No verbose at all. + - 0: This example's output. + - 1: This example's output + PEPit information. + - 2: This example's output + PEPit information + solver details. + + Returns: + pepit_tau (float): worst-case value. + theoretical_tau (float): theoretical value. + + Example: + >>> pepit_tau, theoretical_tau = wc_accelerated_proximal_gradient_simplified(L=1, mu=0, n=4, wrapper="cvxpy", solver=None, verbose=1) + (PEPit) Setting up the problem: size of the Gram matrix: 12x12 + (PEPit) Setting up the problem: performance measure is the minimum of 1 element(s) + (PEPit) Setting up the problem: Adding initial conditions and general constraints ... + (PEPit) Setting up the problem: initial conditions and general constraints (1 constraint(s) added) + (PEPit) Setting up the problem: interpolation conditions for 2 function(s) + Function 1 : Adding 30 scalar constraint(s) ... + Function 1 : 30 scalar constraint(s) added + Function 2 : Adding 20 scalar constraint(s) ... + Function 2 : 20 scalar constraint(s) added + (PEPit) Setting up the problem: additional constraints for 0 function(s) + (PEPit) Compiling SDP + (PEPit) Calling SDP solver + (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.05263158422835028 + (PEPit) Primal feasibility check: + The solver found a Gram matrix that is positive semi-definite up to an error of 5.991982341524508e-09 + All the primal scalar constraints are verified up to an error of 1.4780313955381486e-08 + (PEPit) Dual feasibility check: + The solver found a residual matrix that is positive semi-definite + All the dual scalar values associated with inequality constraints are nonnegative + (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 7.783914601477293e-08 + (PEPit) Final upper bound (dual): 0.052631589673196755 and lower bound (primal example): 0.05263158422835028 + (PEPit) Duality gap: absolute: 5.444846476465592e-09 and relative: 1.034520726726044e-07 + *** Example file: worst-case performance of the Accelerated Proximal Gradient Method in function values*** + PEPit guarantee: f(x_n)-f_* <= 0.0526316 ||x0 - xs||^2 + Theoretical guarantee: f(x_n)-f_* <= 0.0526316 ||x0 - xs||^2 + + """ + + # Instantiate PEP + problem = PEP() + + # Declare a strongly convex smooth function and a convex function + f = problem.declare_function(SmoothStronglyConvexFunction, mu=mu, L=L) + h = problem.declare_function(ConvexFunction) + F = f + h + + # Start by defining its unique optimal point xs = x_* and its function value Fs = F(x_*) + xs = F.stationary_point() + Fs = F(xs) + + # Then define the starting point x0 + x0 = problem.set_initial_point() + + # Set the initial constraint that is the distance between x0 and x^* + problem.set_initial_condition((x0 - xs) ** 2 <= 1) + + # Compute n steps of the accelerated proximal gradient method starting from x0 + x_new = x0 + y = x0 + for i in range(n): + x_old = x_new + x_new, _, hx_new = proximal_step(y - 1 / L * f.gradient(y), h, 1 / L) + y = x_new + i / (i + 3) * (x_new - x_old) + + # Set the performance metric to the function value accuracy + problem.set_performance_metric((f(x_new) + hx_new) - Fs) + + # Solve the PEP + pepit_verbose = max(verbose, 0) + pepit_tau = problem.solve(wrapper=wrapper, solver=solver, verbose=pepit_verbose) + + # Compute theoretical guarantee (for comparison) + theoretical_tau = 2 * L / (n ** 2 + 5 * n + 2) # tight if mu == 
0, see [1], Table 1 (column 1, line 1) + if mu != 0: + print('Warning: momentum is tuned for non-strongly convex functions.') + + # Print conclusion if required + if verbose != -1: + print('*** Example file:' + ' worst-case performance of the Accelerated Proximal Gradient Method in function values***') + print('\tPEPit guarantee:\t f(x_n)-f_* <= {:.6} ||x0 - xs||^2'.format(pepit_tau)) + print('\tTheoretical guarantee:\t f(x_n)-f_* <= {:.6} ||x0 - xs||^2'.format(theoretical_tau)) + + # Return the worst-case guarantee of the evaluated method ( and the reference theoretical value) + return pepit_tau, theoretical_tau + + +if __name__ == "__main__": + pepit_tau, theoretical_tau = wc_accelerated_proximal_gradient_simplified(L=1, mu=0, n=4, + wrapper="cvxpy", solver=None, + verbose=1) diff --git a/PEPit/examples/composite_convex_minimization/three_operator_splitting.py b/PEPit/examples/composite_convex_minimization/three_operator_splitting.py index d56e6a87..774d28ff 100644 --- a/PEPit/examples/composite_convex_minimization/three_operator_splitting.py +++ b/PEPit/examples/composite_convex_minimization/three_operator_splitting.py @@ -97,7 +97,7 @@ def wc_three_operator_splitting(mu1, L1, L3, alpha, theta, n, wrapper="cvxpy", s (PEPit) Final upper bound (dual): 0.4754523347677999 and lower bound (primal example): 0.4754523346392658 (PEPit) Duality gap: absolute: 1.285341277856844e-10 and relative: 2.703407227628939e-10 *** Example file: worst-case performance of the Three Operator Splitting in distance *** - PEPit guarantee: ||w^2_n - w^1_n||^2 <= 0.475452 ||x0 - ws||^2 + PEPit guarantee: ||w^1_n - w^0_n||^2 <= 0.475452 ||w^1_0 - w^0_0||^2 """ diff --git a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_expensive.py b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_expensive.py index 4e828461..62403596 100644 --- a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_expensive.py +++ b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_expensive.py @@ -106,13 +106,13 @@ def wc_gradient_descent_quadratic_lojasiewicz_expensive(L, mu, gamma, n, wrapper All the dual matrices to lmi are positive semi-definite All the dual scalar values associated with inequality constraints are nonnegative up to an error of 5.671954340368105e-10 (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 2.0306640495891322e-08 - (PEPit) Final upper bound (dual): 0.6832669563172779 and lower bound (primal example): 0.6832669556328734 + (PEPit) Final upper bound (dual): 0.6832669563172779 and lower bound (primal example): 0.6832669556328734 (PEPit) Duality gap: absolute: 6.844044220244427e-10 and relative: 1.0016647466735981e-09 *** Example file: worst-case performance of gradient descent with fixed step-size *** - *** (smooth problem satisfying a Lojasiewicz inequality; expert version) *** + *** (smooth problem satisfying a Lojasiewicz inequality; expensive version) *** PEPit guarantee: f(x_1) - f(x_*) <= 0.683267 (f(x_0)-f_*) Theoretical guarantee: f(x_1) - f(x_*) <= 0.727273 (f(x_0)-f_*) - + """ # Instantiate PEP problem = PEP() @@ -160,7 +160,7 @@ def wc_gradient_descent_quadratic_lojasiewicz_expensive(L, mu, gamma, n, wrapper # Print conclusion if required if verbose != -1: print('*** Example file: worst-case performance of gradient descent with fixed step-size ***') - print('*** \t (smooth problem satisfying a Lojasiewicz inequality; expert version) ***') + print('*** \t (smooth problem 
satisfying a Lojasiewicz inequality; expensive version) ***') print('\tPEPit guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(pepit_tau)) print('\tTheoretical guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(theoretical_tau)) diff --git a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_intermediate.py b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_intermediate.py index 82418ba6..8a675989 100644 --- a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_intermediate.py +++ b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_intermediate.py @@ -79,7 +79,7 @@ def wc_gradient_descent_quadratic_lojasiewicz_intermediate(L, mu, gamma, n, alph Example: >>> L, mu, gamma, n = 1, .2, 1, 1 >>> alpha = (2*mu/(2*L+mu)) - >>> pepit_tau, theoretical_tau = wc_gradient_descent_refinedLojasiewicz(L=L, gamma=gamma, n=1, alpha=alpha, wrapper="cvxpy", solver=None, verbose=1) + >>> pepit_tau, theoretical_tau = wc_gradient_descent_quadratic_lojasiewicz_intermediate(L=L, gamma=gamma, n=1, alpha=alpha, wrapper="cvxpy", solver=None, verbose=1) (PEPit) Setting up the problem: size of the Gram matrix: 4x4 (PEPit) Setting up the problem: performance measure is the minimum of 1 element(s) (PEPit) Setting up the problem: Adding initial conditions and general constraints ... @@ -98,10 +98,10 @@ def wc_gradient_descent_quadratic_lojasiewicz_intermediate(L, mu, gamma, n, alph The solver found a residual matrix that is positive semi-definite All the dual scalar values associated with inequality constraints are nonnegative up to an error of 5.521136597015314e-11 (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 3.812156896706273e-11 - (PEPit) Final upper bound (dual): 0.7272727272394729 and lower bound (primal example): 0.727272727239017 + (PEPit) Final upper bound (dual): 0.7272727272394729 and lower bound (primal example): 0.727272727239017 (PEPit) Duality gap: absolute: 4.558575739110893e-13 and relative: 6.268041641568012e-13 *** Example file: worst-case performance of gradient descent with fixed step-size *** - *** (smooth problem satisfying a Lojasiewicz inequality; refined version) *** + *** (smooth problem satisfying a Lojasiewicz inequality; intermediate version) *** PEPit guarantee: f(x_1) - f(x_*) <= 0.727273 (f(x_0)-f_*) Theoretical guarantee: f(x_1) - f(x_*) <= 0.727273 (f(x_0)-f_*) @@ -152,7 +152,7 @@ def wc_gradient_descent_quadratic_lojasiewicz_intermediate(L, mu, gamma, n, alph # Print conclusion if required if verbose != -1: print('*** Example file: worst-case performance of gradient descent with fixed step-size ***') - print('*** \t (smooth problem satisfying a Lojasiewicz inequality; refined version) ***') + print('*** \t (smooth problem satisfying a Lojasiewicz inequality; intermediate version) ***') print('\tPEPit guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(pepit_tau)) print('\tTheoretical guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(theoretical_tau)) diff --git a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_naive.py b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_naive.py index 4db81d61..e07253df 100644 --- a/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_naive.py +++ b/PEPit/examples/nonconvex_optimization/gradient_descent_quadratic_lojasiewicz_naive.py @@ -92,10 +92,10 @@ def 
wc_gradient_descent_quadratic_lojasiewicz_naive(L, mu, gamma, n, wrapper="cv The solver found a residual matrix that is positive semi-definite All the dual scalar values associated with inequality constraints are nonnegative up to an error of 6.45387237307569e-09 (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 3.656794643018536e-08 - (PEPit) Final upper bound (dual): 0.7272727115834279 and lower bound (primal example): 0.7272727088286305 + (PEPit) Final upper bound (dual): 0.7272727115834279 and lower bound (primal example): 0.7272727088286305 (PEPit) Duality gap: absolute: 2.754797390203123e-09 and relative: 3.7878465075914795e-09 *** Example file: worst-case performance of gradient descent with fixed step-size *** - *** (smooth problem satisfying a Lojasiewicz inequality; basic version) *** + *** (smooth problem satisfying a Lojasiewicz inequality; cheap naive version) *** PEPit guarantee: f(x_1) - f(x_*) <= 0.727273 (f(x_0)-f_*) Theoretical guarantee: f(x_1) - f(x_*) <= 0.727273 (f(x_0)-f_*) @@ -146,7 +146,7 @@ def wc_gradient_descent_quadratic_lojasiewicz_naive(L, mu, gamma, n, wrapper="cv # Print conclusion if required if verbose != -1: print('*** Example file: worst-case performance of gradient descent with fixed step-size ***') - print('*** \t (smooth problem satisfying a Lojasiewicz inequality; basic version) ***') + print('*** \t (smooth problem satisfying a Lojasiewicz inequality; cheap naive version) ***') print('\tPEPit guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(pepit_tau)) print('\tTheoretical guarantee:\t f(x_1) - f(x_*) <= {:.6} (f(x_0)-f_*)'.format(theoretical_tau)) diff --git a/PEPit/examples/online_learning/online_follow_leader.py b/PEPit/examples/online_learning/online_follow_leader.py index face3cfe..fa0af7b4 100644 --- a/PEPit/examples/online_learning/online_follow_leader.py +++ b/PEPit/examples/online_learning/online_follow_leader.py @@ -80,16 +80,16 @@ def wc_online_follow_leader(M, D, n, wrapper="cvxpy", solver=None, verbose=1): (PEPit) Setting up the problem: additional constraints for 0 function(s) (PEPit) Compiling SDP (PEPit) Calling SDP solver - (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.9330127285845171 + (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.933012716710238 (PEPit) Primal feasibility check: - The solver found a Gram matrix that is positive semi-definite up to an error of 8.958065311703632e-09 - All the primal scalar constraints are verified up to an error of 3.481293675555719e-08 + The solver found a Gram matrix that is positive semi-definite up to an error of 7.082570717379284e-09 + All the primal scalar constraints are verified up to an error of 2.8049949474251434e-08 (PEPit) Dual feasibility check: The solver found a residual matrix that is positive semi-definite - All the dual scalar values associated with inequality constraints are nonnegative - (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 1.2781705987788661e-07 - (PEPit) Final upper bound (dual): 0.93301273263622 and lower bound (primal example): 0.9330127285845171 - (PEPit) Duality gap: absolute: 4.051702862106765e-09 and relative: 4.342601915253229e-09 + All the dual scalar values associated with inequality constraints are nonnegative up to an error of 2.2603605415766684e-10 + (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 1.0515587808859884e-07 + (PEPit) Final upper bound (dual): 
0.9330127181067216 and lower bound (primal example): 0.933012716710238 + (PEPit) Duality gap: absolute: 1.3964835954283217e-09 and relative: 1.4967465827821315e-09 *** Example file: worst-case regret of online follow the leader *** PEPit guarantee: R_n <= 0.933013 diff --git a/PEPit/examples/unconstrained_convex_minimization/__init__.py b/PEPit/examples/unconstrained_convex_minimization/__init__.py index 7bc837da..dd7befc2 100644 --- a/PEPit/examples/unconstrained_convex_minimization/__init__.py +++ b/PEPit/examples/unconstrained_convex_minimization/__init__.py @@ -1,4 +1,5 @@ from .accelerated_gradient_convex import wc_accelerated_gradient_convex +from .accelerated_gradient_convex_simplified import wc_accelerated_gradient_convex_simplified from .accelerated_gradient_strongly_convex import wc_accelerated_gradient_strongly_convex from .accelerated_proximal_point import wc_accelerated_proximal_point from .conjugate_gradient import wc_conjugate_gradient @@ -29,6 +30,7 @@ __all__ = ['accelerated_gradient_convex', 'wc_accelerated_gradient_convex', + 'accelerated_gradient_convex_simplified', 'wc_accelerated_gradient_convex_simplified', 'accelerated_gradient_strongly_convex', 'wc_accelerated_gradient_strongly_convex', 'accelerated_proximal_point', 'wc_accelerated_proximal_point', 'conjugate_gradient', 'wc_conjugate_gradient', diff --git a/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex.py b/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex.py index 92f66d22..79b639a8 100644 --- a/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex.py +++ b/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex.py @@ -1,3 +1,5 @@ +from math import sqrt + from PEPit import PEP from PEPit.functions import SmoothStronglyConvexFunction @@ -10,7 +12,7 @@ def wc_accelerated_gradient_convex(mu, L, n, wrapper="cvxpy", solver=None, verbo where :math:`f` is :math:`L`-smooth and :math:`\\mu`-strongly convex (:math:`\\mu` is possibly 0). - This code computes a worst-case guarantee for an **accelerated gradient method**, a.k.a. **fast gradient method**. + This code computes a worst-case guarantee for an **accelerated gradient method**, a.k.a. **fast gradient method** [1]. That is, it computes the smallest possible :math:`\\tau(n, L, \\mu)` such that the guarantee .. math:: f(x_n) - f_\\star \\leqslant \\tau(n, L, \\mu) \\|x_0 - x_\\star\\|^2 @@ -21,30 +23,32 @@ def wc_accelerated_gradient_convex(mu, L, n, wrapper="cvxpy", solver=None, verbo :math:`\\tau(n, L, \\mu)` is computed as the worst-case value of :math:`f(x_n)-f_\\star` when :math:`\\|x_0 - x_\\star\\|^2 \\leqslant 1`. - **Algorithm**: - The accelerated gradient method of this example is provided by - - .. math:: - :nowrap: + **Algorithm**: Initialize :math:`\\lambda_1=1`, :math:`y_1=x_0`. + One iteration of accelerated gradient method is described by - \\begin{eqnarray} - x_{t+1} & = & y_t - \\frac{1}{L} \\nabla f(y_t) \\\\ - y_{t+1} & = & x_{t+1} + \\frac{t-1}{t+2} (x_{t+1} - x_t). - \\end{eqnarray} + .. math:: - **Theoretical guarantee**: - When :math:`\\mu=0`, a tight **empirical** guarantee can be found in [1, Table 1]: + \\begin{eqnarray} + \\text{Set: }\\lambda_{t+1} & = & \\frac{1 + \\sqrt{4\\lambda_t^2 + 1}}{2} \\\\ + x_{t} & = & y_t - \\frac{1}{L} \\nabla f(y_t),\\\\ + y_{t+1} & = & x_{t} + \\frac{\\lambda_t-1}{\\lambda_{t+1}} (x_t-x_{t-1}). + \\end{eqnarray} - .. 
math:: f(x_n)-f_\\star \\leqslant \\frac{2L\\|x_0-x_\\star\\|^2}{n^2 + 5 n + 6}, + **Theoretical guarantee**: The following worst-case guarantee can be found in e.g., [2, Theorem 4.4]: - where tightness is obtained on some Huber loss functions. + .. math:: f(x_n)-f_\\star \\leqslant \\frac{L}{2}\\frac{\\|x_0-x_\\star\\|^2}{\\lambda_n^2}. **References**: - - `[1] A. Taylor, J. Hendrickx, F. Glineur (2017). - Exact worst-case performance of first-order methods for composite convex optimization. - SIAM Journal on Optimization, 27(3):1283–1313. - `_ + + `[1] Y. Nesterov (1983). + A method for solving the convex programming problem with convergence rate O(1/k^2). + In Dokl. akad. nauk Sssr (Vol. 269, pp. 543-547). + `_ + + `[2] A. Beck, M. Teboulle (2009). + A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems. + SIAM journal on imaging sciences, 2009, vol. 2, no 1, p. 183-202. + `_ Args: mu (float): the strong convexity parameter @@ -87,7 +91,7 @@ def wc_accelerated_gradient_convex(mu, L, n, wrapper="cvxpy", solver=None, verbo (PEPit) Duality gap: absolute: 3.834839723548811e-09 and relative: 2.3009039102756247e-08 *** Example file: worst-case performance of accelerated gradient method *** PEPit guarantee: f(x_n)-f_* <= 0.166667 ||x_0 - x_*||^2 - Theoretical guarantee: f(x_n)-f_* <= 0.166667 ||x_0 - x_*||^2 + Theoretical guarantee: f(x_n)-f_* <= 0.5 ||x_0 - x_*||^2 """ # Instantiate PEP @@ -107,22 +111,27 @@ def wc_accelerated_gradient_convex(mu, L, n, wrapper="cvxpy", solver=None, verbo problem.set_initial_condition((x0 - xs) ** 2 <= 1) # Run n steps of the fast gradient method - x_new = x0 + x = x0 y = x0 - for i in range(n): - x_old = x_new - x_new = y - 1 / L * func.gradient(y) - y = x_new + i / (i + 3) * (x_new - x_old) + lam = 1 + + for _ in range(n): + lam_old = lam + lam = (1 + sqrt(4 * lam_old ** 2 + 1)) / 2 + x_old = x + x = y - 1 / L * func.gradient(y) + y = x + (lam_old - 1) / lam * (x - x_old) # Set the performance metric to the function value accuracy - problem.set_performance_metric(func(x_new) - fs) + problem.set_performance_metric(func(x) - fs) # Solve the PEP pepit_verbose = max(verbose, 0) pepit_tau = problem.solve(wrapper=wrapper, solver=solver, verbose=pepit_verbose) # Theoretical guarantee (for comparison) - theoretical_tau = 2 * L / (n ** 2 + 5 * n + 6) # tight only for mu=0, see [2], Table 1 (column 1, line 1) + theoretical_tau = L / (2 * lam_old**2) + if mu != 0: print('Warning: momentum is tuned for non-strongly convex functions.') @@ -137,4 +146,5 @@ def wc_accelerated_gradient_convex(mu, L, n, wrapper="cvxpy", solver=None, verbo if __name__ == "__main__": - pepit_tau, theoretical_tau = wc_accelerated_gradient_convex(mu=0, L=1, n=1, wrapper="cvxpy", solver=None, verbose=1) + pepit_tau, theoretical_tau = wc_accelerated_gradient_convex(mu=0, L=1, n=1, wrapper="cvxpy", + solver=None, verbose=1) diff --git a/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex_simplified.py b/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex_simplified.py new file mode 100644 index 00000000..79029f4a --- /dev/null +++ b/PEPit/examples/unconstrained_convex_minimization/accelerated_gradient_convex_simplified.py @@ -0,0 +1,148 @@ +from PEPit import PEP +from PEPit.functions import SmoothStronglyConvexFunction + + +def wc_accelerated_gradient_convex_simplified(mu, L, n, wrapper="cvxpy", solver=None, verbose=1): + """ + Consider the convex minimization problem + + .. 
math:: f_\\star \\triangleq \\min_x f(x), + + where :math:`f` is :math:`L`-smooth and :math:`\\mu`-strongly convex (:math:`\\mu` is possibly 0). + + This code computes a worst-case guarantee for an **accelerated gradient method**, a.k.a. **fast gradient method** + with a set of classical slightly simplified sets of coefficients compared to the original [1]. + That is, the code computes the smallest possible :math:`\\tau(n, L, \\mu)` such that the guarantee + + .. math:: f(x_n) - f_\\star \\leqslant \\tau(n, L, \\mu) \\|x_0 - x_\\star\\|^2 + + is valid, where :math:`x_n` is the output of the accelerated gradient method below, + and where :math:`x_\\star` is the minimizer of :math:`f`. + In short, for given values of :math:`n`, :math:`L` and :math:`\\mu`, + :math:`\\tau(n, L, \\mu)` is computed as the worst-case value of + :math:`f(x_n)-f_\\star` when :math:`\\|x_0 - x_\\star\\|^2 \\leqslant 1`. + + **Algorithm**: + The accelerated gradient method of this example is provided by + + .. math:: + :nowrap: + + \\begin{eqnarray} + x_{t+1} & = & y_t - \\frac{1}{L} \\nabla f(y_t) \\\\ + y_{t+1} & = & x_{t+1} + \\frac{t-1}{t+2} (x_{t+1} - x_t). + \\end{eqnarray} + + **Theoretical guarantee**: + When :math:`\\mu=0`, a tight **empirical** guarantee can be found in [2, Table 1]: + + .. math:: f(x_n)-f_\\star \\leqslant \\frac{2L\\|x_0-x_\\star\\|^2}{n^2 + 5 n + 6}, + + where tightness is obtained on some Huber loss functions. + + **References**: + + `[1] Y. Nesterov (1983). + A method for solving the convex programming problem with convergence rate O(1/k^2). + In Dokl. akad. nauk Sssr (Vol. 269, pp. 543-547). + `_ + + `[2] A. Taylor, J. Hendrickx, F. Glineur (2017). + Exact worst-case performance of first-order methods for composite convex optimization. + SIAM Journal on Optimization, 27(3):1283–1313. + `_ + + Args: + mu (float): the strong convexity parameter + L (float): the smoothness parameter. + n (int): number of iterations. + wrapper (str): the name of the wrapper to be used. + solver (str): the name of the solver the wrapper should use. + verbose (int): level of information details to print. + + - -1: No verbose at all. + - 0: This example's output. + - 1: This example's output + PEPit information. + - 2: This example's output + PEPit information + solver details. + + Returns: + pepit_tau (float): worst-case value + theoretical_tau (float): theoretical value + + Example: + >>> pepit_tau, theoretical_tau = wc_accelerated_gradient_convex_simplified(mu=0, L=1, n=1, wrapper="cvxpy", solver=None, verbose=1) + (PEPit) Setting up the problem: size of the Gram matrix: 4x4 + (PEPit) Setting up the problem: performance measure is the minimum of 1 element(s) + (PEPit) Setting up the problem: Adding initial conditions and general constraints ... + (PEPit) Setting up the problem: initial conditions and general constraints (1 constraint(s) added) + (PEPit) Setting up the problem: interpolation conditions for 1 function(s) + Function 1 : Adding 6 scalar constraint(s) ... 
+ Function 1 : 6 scalar constraint(s) added + (PEPit) Setting up the problem: additional constraints for 0 function(s) + (PEPit) Compiling SDP + (PEPit) Calling SDP solver + (PEPit) Solver status: optimal (wrapper:cvxpy, solver: MOSEK); optimal value: 0.16666666115098375 + (PEPit) Primal feasibility check: + The solver found a Gram matrix that is positive semi-definite up to an error of 4.82087966328108e-09 + All the primal scalar constraints are verified up to an error of 3.6200406144937247e-09 + (PEPit) Dual feasibility check: + The solver found a residual matrix that is positive semi-definite + All the dual scalar values associated with inequality constraints are nonnegative + (PEPit) The worst-case guarantee proof is perfectly reconstituted up to an error of 3.101096412994053e-08 + (PEPit) Final upper bound (dual): 0.16666666498582347 and lower bound (primal example): 0.16666666115098375 + (PEPit) Duality gap: absolute: 3.834839723548811e-09 and relative: 2.3009039102756247e-08 + *** Example file: worst-case performance of accelerated gradient method *** + PEPit guarantee: f(x_n)-f_* <= 0.166667 ||x_0 - x_*||^2 + Theoretical guarantee: f(x_n)-f_* <= 0.166667 ||x_0 - x_*||^2 + + """ + # Instantiate PEP + problem = PEP() + + # Declare a strongly convex smooth function + func = problem.declare_function(SmoothStronglyConvexFunction, mu=mu, L=L) + + # Start by defining its unique optimal point xs = x_* and corresponding function value fs = f_* + xs = func.stationary_point() + fs = func(xs) + + # Then define the starting point x0 of the algorithm + x0 = problem.set_initial_point() + + # Set the initial constraint that is the distance between x0 and x^* + problem.set_initial_condition((x0 - xs) ** 2 <= 1) + + # Run n steps of the fast gradient method + x_new = x0 + y = x0 + for i in range(n): + x_old = x_new + x_new = y - 1 / L * func.gradient(y) + y = x_new + i / (i + 3) * (x_new - x_old) + + # Set the performance metric to the function value accuracy + problem.set_performance_metric(func(x_new) - fs) + + # Solve the PEP + pepit_verbose = max(verbose, 0) + pepit_tau = problem.solve(wrapper=wrapper, solver=solver, verbose=pepit_verbose) + + # Theoretical guarantee (for comparison) + theoretical_tau = 2 * L / (n ** 2 + 5 * n + 6) # tight only for mu=0, see [2], Table 1 (column 1, line 1) + if mu != 0: + print('Warning: momentum is tuned for non-strongly convex functions.') + + # Print conclusion if required + if verbose != -1: + print('*** Example file: worst-case performance of accelerated gradient method ***') + print('\tPEPit guarantee:\t f(x_n)-f_* <= {:.6} ||x_0 - x_*||^2'.format(pepit_tau)) + print('\tTheoretical guarantee:\t f(x_n)-f_* <= {:.6} ||x_0 - x_*||^2'.format(theoretical_tau)) + + # Return the worst-case guarantee of the evaluated method (and the reference theoretical value) + return pepit_tau, theoretical_tau + + +if __name__ == "__main__": + pepit_tau, theoretical_tau = wc_accelerated_gradient_convex_simplified(mu=0, L=1, n=1, + wrapper="cvxpy", + solver=None, verbose=1) diff --git a/README.md b/README.md index e4594561..5957174a 100644 --- a/README.md +++ b/README.md @@ -318,7 +318,7 @@ as well as for support regarding the continuous integration. [4] A. Taylor, J. Hendrickx, F. Glineur (2017). [Performance Estimation Toolbox (PESTO): automated worst-case analysis of first-order optimization methods](https://adrientaylor.github.io/share/PESTO_CDC_2017.pdf). In 56th IEEE Conference on Decision and Control (CDC). -[5] B. Goujaud, C. Moucer, F. Glineur, J.M. Hendrickx, A.B. 
Taylor, A. Dieuleveut (2024). +[5] B Goujaud, C. Moucer, F. Glineur, J.M. Hendrickx, A.B. Taylor, A. Dieuleveut (2024). [PEPit: computer-assisted worst-case analyses of first-order optimization methods in Python](https://arxiv.org/pdf/2201.04040). Mathematical Programming Computation 16 (3), 337-367. [6] R.T. Rockafellar (1976). @@ -384,206 +384,212 @@ as well as for support regarding the continuous integration. [26] L. Lessard, B. Recht, A. Packard (2016). [Analysis and design of optimization algorithms via integral quadratic constraints](https://arxiv.org/pdf/1408.3595.pdf). SIAM Journal on Optimization 26(1), 57–95. -[27] P. Patrinos, L. Stella, A. Bemporad (2014). +[27] M. Jaggi (2013). +[Revisiting Frank-Wolfe: Projection-free sparse convex optimization](http://proceedings.mlr.press/v28/jaggi13.pdf). In 30th International Conference on Machine Learning (ICML). + +[28] P. Patrinos, L. Stella, A. Bemporad (2014). [Douglas-Rachford splitting: Complexity estimates and accelerated variants](https://arxiv.org/pdf/1407.6723.pdf). In 53rd IEEE Conference on Decision and Control (CDC). -[28] A. Auslender, M. Teboulle (2006). +[29] A. Auslender, M. Teboulle (2006). [Interior gradient and proximal methods for convex and conic optimization](https://epubs.siam.org/doi/pdf/10.1137/S1052623403427823). SIAM Journal on Optimization 16.3 (2006): 697-725. -[29] E. Ryu, A. Taylor, C. Bergeling, P. Giselsson (2020). +[30] A. Beck, M. Teboulle (2009). +[A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems](https://www.ceremade.dauphine.fr/~carlier/FISTA). SIAM journal on imaging sciences, 2009, vol. 2, no 1, p. 183-202. + +[31] E. Ryu, A. Taylor, C. Bergeling, P. Giselsson (2020). [Operator splitting performance estimation: Tight contraction factors and optimal parameter selection](https://arxiv.org/pdf/1812.00146.pdf). SIAM Journal on Optimization, 30(3), 2251-2271. -[30] P. Giselsson, and S. Boyd (2016). +[32] P. Giselsson, and S. Boyd (2016). [Linear convergence and metric selection in Douglas-Rachford splitting and ADMM](https://arxiv.org/pdf/1410.8479.pdf). IEEE Transactions on Automatic Control, 62(2), 532-544. -[31] J. Park, E. Ryu (2023). +[33] J. Park, E. Ryu (2023). [Accelerated Infeasibility Detection of Constrained Optimization and Fixed-Point Iterations](https://arxiv.org/pdf/2303.15876.pdf). International Conference on Machine Learning. -[32] B. Halpern (1967). +[34] B. Halpern (1967). [Fixed points of nonexpanding maps](https://www.ams.org/journals/bull/1967-73-06/S0002-9904-1967-11864-0/S0002-9904-1967-11864-0.pdf). American Mathematical Society, 73(6), 957–961. -[33] F. Lieder (2021). +[35] F. Lieder (2021). [On the convergence rate of the Halpern-iteration](http://www.optimization-online.org/DB_FILE/2017/11/6336.pdf). Optimization Letters, 15(2), 405-418. -[34] F. Lieder (2018). +[36] F. Lieder (2018). [Projection Based Methods for Conic Linear Programming Optimal First Order Complexities and Norm Constrained Quasi Newton Methods](https://docserv.uni-duesseldorf.de/servlets/DerivateServlet/Derivate-49971/Dissertation.pdf). PhD thesis, HHU Düsseldorf. -[35] J. Park, E. Ryu (2022). +[37] J. Park, E. Ryu (2022). [Exact Optimal Accelerated Complexity for Fixed-Point Iterations](https://proceedings.mlr.press/v162/park22c/park22c.pdf). In 39th International Conference on Machine Learning (ICML). -[36] B. Hu, P. Seiler, L. Lessard (2020). +[38] B. Hu, P. Seiler, L. Lessard (2020). 
[Analysis of biased stochastic gradient descent using sequential semidefinite programs](https://arxiv.org/pdf/1711.00987.pdf). Mathematical programming. -[37] A. Taylor, F. Bach (2019). +[39] A. Taylor, F. Bach (2019). [Stochastic first-order methods: non-asymptotic and computer-aided analyses via potential functions](https://arxiv.org/pdf/1902.00947.pdf). Conference on Learning Theory (COLT). -[38] A. Defazio (2016). +[40] A. Defazio (2016). [A simple practical accelerated method for finite sums](https://proceedings.neurips.cc/paper/2016/file/4f6ffe13a5d75b2d6a3923922b3922e5-Paper.pdf). Advances in Neural Information Processing Systems (NIPS), 29, 676-684. -[39] A. Defazio, F. Bach, S. Lacoste-Julien (2014). +[41] A. Defazio, F. Bach, S. Lacoste-Julien (2014). [SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives](http://papers.nips.cc/paper/2014/file/ede7e2b6d13a41ddf9f4bdef84fdc737-Paper.pdf). In Advances in Neural Information Processing Systems (NIPS). -[40] S. Lojasiewicz (1963). +[42] S. Lojasiewicz (1963). [Une propriété topologique des sous-ensembles analytiques réels](https://aif.centre-mersenne.org/item/10.5802/aif.1384.pdf). Les équations aux dérivées partielles, 117 (1963), 87–89. -[41] B. Polyak (1963). +[43] B. Polyak (1963). [Gradient methods for the minimisation of functionals USSR Computational Mathematics and Mathematical Physics 3(4), 864–878](https://www.sciencedirect.com/science/article/abs/pii/0041555363903823) -[42] J. Bolte, A. Daniilidis, and A. Lewis (2007). +[44] J. Bolte, A. Daniilidis, and A. Lewis (2007). [The Łojasiewicz inequality for nonsmooth subanalytic functions with applications to subgradient dynamical systems](https://bolte.perso.math.cnrs.fr/Loja.pdf). SIAM Journal on Optimization 17, 1205–1223. -[43] H. Abbaszadehpeivasti, E. de Klerk, M. Zamani (2023). +[45] H. Abbaszadehpeivasti, E. de Klerk, M. Zamani (2023). [Conditions for linear convergence of the gradient method for non-convex optimization](https://arxiv.org/pdf/2204.00647). Optimization Letters. -[44] A. Rubbens, J.M. Hendrickx, A. Taylor (2025). +[46] A. Rubbens, J.M. Hendrickx, A. Taylor (2025). [A constructive approach to strengthen algebraic descriptions of function and operator classes](https://arxiv.org/pdf/2504.14377.pdf) -[45] H. Abbaszadehpeivasti, E. de Klerk, M. Zamani (2021). +[47] H. Abbaszadehpeivasti, E. de Klerk, M. Zamani (2021). [On the rate of convergence of the difference-of-convex algorithm (DCA)](https://arxiv.org/pdf/2109.13566). Journal of Optimization Theory and Applications, 202(1), 475-496. -[46] T. Rotaru, P. Patrinos, F. Glineur (2025). +[48] T. Rotaru, P. Patrinos, F. Glineur (2025). [Tight Analysis of Difference-of-Convex Algorithm (DCA) Improves Convergence Rates for Proximal Gradient Descent](https://arxiv.org/pdf/2503.04486). Journal of Optimization Theory and Applications, 202(1), 475-496. -[47] J. Bolte, S. Sabach, M. Teboulle, Y. Vaisbourd (2018). +[49] J. Bolte, S. Sabach, M. Teboulle, Y. Vaisbourd (2018). [First order methods beyond convexity and Lipschitz gradient continuity with applications to quadratic inverse problems](https://arxiv.org/pdf/1706.06461.pdf). SIAM Journal on Optimization, 28(3), 2131-2151. -[48] Taylor, A. B. (2017). +[50] Taylor, A. B. (2017). [Convex interpolation and performance estimation of first-order methods for convex optimization](https://dial.uclouvain.be/downloader/downloader.php?pid=boreal:182881&datastream=PDF_01). PhD Thesis, UCLouvain. -[49] H. Abbaszadehpeivasti, E. 
de Klerk, M. Zamani (2021). +[51] H. Abbaszadehpeivasti, E. de Klerk, M. Zamani (2021). [The exact worst-case convergence rate of the gradient method with fixed step lengths for L-smooth functions](https://arxiv.org/pdf/2104.05468v3.pdf). Optimization Letters, 16(6), 1649-1661. -[50] E. Hazan (2016). +[52] E. Hazan (2016). [Introduction to online convex optimization](https://arxiv.org/pdf/1912.13213). Foundations and Trends in Optimization, 2(3-4), 157-325. -[51] J. Weibel, P. Gaillard, W.M. Koolen, A. Taylor (2025). +[53] J. Weibel, P. Gaillard, W.M. Koolen, A. Taylor (2025). [Optimized projection-free algorithms for online learning: construction and worst-case analysis](https://arxiv.org/pdf/2506.05855) -[52] F. Jakob, A. Iannelli (2025). +[54] F. Jakob, A. Iannelli (2025). [Online Convex Optimization and Integral Quadratic Constraints: A new approach to regret analysis](https://arxiv.org/pdf/2503.23600?) -[53] N. Bansal, A. Gupta (2019). +[55] N. Bansal, A. Gupta (2019). [Potential-function proofs for gradient methods](https://arxiv.org/pdf/1712.04581.pdf). Theory of Computing, 15(1), 1-32. -[54] Y. Nesterov (1983). +[56] Y. Nesterov (1983). [A method for solving the convex programming problem with convergence rate O(1/k^2)](http://www.mathnet.ru/links/9bcb158ed2df3d8db3532aafd551967d/dan46009.pdf). In Dokl. akad. nauk Sssr (Vol. 269, pp. 543-547). -[55] Y.-G. Hsieh, F. Iutzeler, J. Malick, P. Mertikopoulos (2019). +[57] Y.-G. Hsieh, F. Iutzeler, J. Malick, P. Mertikopoulos (2019). [On the convergence of single-call stochastic extra-gradient methods](https://arxiv.org/pdf/1908.08465.pdf). Advances in Neural Information Processing Systems, 32:6938–6948, 2019 -[56] E. Gorbunov, A. Taylor, G. Gidel (2022). +[58] E. Gorbunov, A. Taylor, G. Gidel (2022). [Last-Iterate Convergence of Optimistic Gradient Method for Monotone Variational Inequalities](https://arxiv.org/pdf/2205.08446.pdf) -[57] W. Moursi, L. Vandenberghe (2019). +[59] W. Moursi, L. Vandenberghe (2019). [Douglas–Rachford Splitting for the Sum of a Lipschitz Continuous and a Strongly Monotone Operator](https://arxiv.org/pdf/1805.09396.pdf). Journal of Optimization Theory and Applications 183, 179–198. -[58] Y. Cai, A. Oikonomou, W. Zheng (2022). +[60] Y. Cai, A. Oikonomou, W. Zheng (2022). [Tight Last-Iterate Convergence of the Extragradient and the Optimistic Gradient Descent-Ascent Algorithm for Constrained Monotone Variational Inequalities](https://arxiv.org/pdf/2204.09228.pdf) -[59] D. Kim (2021). +[61] D. Kim (2021). [Accelerated proximal point method for maximally monotone operators](https://arxiv.org/pdf/1905.05149v4.pdf). Mathematical Programming, 1-31. -[60] G. Gu, J. Yang (2020). +[62] G. Gu, J. Yang (2020). [Tight sublinear convergence rate of the proximal point algorithm for maximal monotone inclusion problem](https://epubs.siam.org/doi/pdf/10.1137/19M1299049). SIAM Journal on Optimization, 30(3), 1905-1921. -[61] C. Guille-Escuret, B. Goujaud, A. Ibrahim, I. Mitliagkas (2022). +[63] C. Guille-Escuret, B. Goujaud, A. Ibrahim, I. Mitliagkas (2022). [Gradient Descent Is Optimal Under Lower Restricted Secant Inequality And Upper Error Bound](https://arxiv.org/pdf/2203.00342.pdf) -[62] E. De Klerk, F. Glineur, A. Taylor (2017). +[64] E. De Klerk, F. Glineur, A. Taylor (2017). [On the worst-case complexity of the gradient method with exact line search for smooth strongly convex functions](https://link.springer.com/content/pdf/10.1007/s11590-016-1087-4.pdf). Optimization Letters, 11(7), 1185-1199. -[63] E. Ghadimi, H. R. 
Feyzmahdavian, M. Johansson (2015). +[65] E. Ghadimi, H. R. Feyzmahdavian, M. Johansson (2015). [Global convergence of the Heavy-ball method for convex optimization](https://arxiv.org/pdf/1412.7457.pdf). European Control Conference (ECC). -[64] B. Goujaud, A. Taylor, A. Dieuleveut (2022). +[66] B. Goujaud, A. Taylor, A. Dieuleveut (2022). [Optimal first-order methods for convex functions with a quadratic upper bound](https://arxiv.org/pdf/2205.15033.pdf) -[65] Y. Nesterov (2003). +[67] Y. Nesterov (2003). [Introductory lectures on convex optimization: A basic course](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.693.855&rep=rep1&type=pdf). Springer Science & Business Media. -[66] S. Boyd, L. Xiao, A. Mutapcic (2003). +[68] S. Boyd, L. Xiao, A. Mutapcic (2003). [Subgradient Methods (lecture notes)](https://web.stanford.edu/class/ee392o/subgrad_method.pdf) -[67] Y. Drori, M. Teboulle (2016). +[69] Y. Drori, M. Teboulle (2016). [An optimal variant of Kelley's cutting-plane method](https://arxiv.org/pdf/1409.2636.pdf). Mathematical Programming, 160(1), 321-351. -[68] D. Kim, J. Fessler (2021). +[70] D. Kim, J. Fessler (2021). [Optimizing the efficiency of first-order methods for decreasing the gradient of smooth convex functions](https://arxiv.org/pdf/1803.06600.pdf). Journal of optimization theory and applications, 188(1), 192-219. -[69] N. Bousselmi, J. Hendrickx, F. Glineur (2023). +[71] N. Bousselmi, J. Hendrickx, F. Glineur (2023). [Interpolation Conditions for Linear Operators and applications to Performance Estimation Problems](https://arxiv.org/pdf/2302.08781.pdf). arXiv preprint -[70] Y. Drori (2017). +[72] Y. Drori (2017). [The exact information-based complexity of smooth convex minimization](https://arxiv.org/pdf/1606.01424.pdf). Journal of Complexity, 39, 1-16. -[71] J. M. Altschuler, P. A. Parrilo (2023). +[73] J. M. Altschuler, P. A. Parrilo (2023). [Acceleration by Stepsize Hedging I: Multi-Step Descent and the Silver Stepsize Schedule](https://arxiv.org/abs/2309.07879). arXiv preprint arXiv:2309.07879. -[72] J. M. Altschuler, P. A. Parrilo (2023). +[74] J. M. Altschuler, P. A. Parrilo (2023). [Acceleration by Stepsize Hedging II: Silver Stepsize Schedule for Smooth Convex Optimization](https://arxiv.org/abs/2309.16530). arXiv preprint arXiv:2309.16530. -[73] R.D. Millán, M.P. Machado (2019). +[75] R.D. Millán, M.P. Machado (2019). [Inexact proximal epsilon-subgradient methods for composite convex optimization problems](https://arxiv.org/pdf/1805.10120.pdf). Journal of Global Optimization 75.4 (2019): 1029-1060. -[74] D. Kim, J. Fessler (2016). +[76] D. Kim, J. Fessler (2016). [Optimized first-order methods for smooth convex minimization](https://arxiv.org/pdf/1406.5468.pdf). Mathematical Programming 159.1-2: 81-107. -[75] S. Cyrus, B. Hu, B. Van Scoy, L. Lessard (2018). +[77] S. Cyrus, B. Hu, B. Van Scoy, L. Lessard (2018). [A robust accelerated optimization algorithm for strongly convex functions](https://arxiv.org/pdf/1710.04753.pdf). American Control Conference (ACC). -[76] O. Güler (1992). +[78] O. Güler (1992). [New proximal point algorithms for convex minimization](https://epubs.siam.org/doi/abs/10.1137/0802032?mobileUi=0). SIAM Journal on Optimization, 2(4):649–664. -[77] A. Taylor, Y. Drori (2022). +[79] A. Taylor, Y. Drori (2022). [An optimal gradient method for smooth strongly convex minimization](https://arxiv.org/pdf/2101.09741.pdf). Mathematical Programming. -[78] Van Scoy, B., Freeman, R. A., Lynch, K. M. (2018). +[80] Van Scoy, B., Freeman, R. 
A., Lynch, K. M. (2018). [The fastest known globally convergent first-order method for minimizing strongly convex functions](http://www.optimization-online.org/DB_FILE/2017/03/5908.pdf). IEEE Control Systems Letters, 2(1), 49-54. -[79] O. Gannot (2021). +[81] O. Gannot (2021). [A frequency-domain analysis of inexact gradient methods](https://arxiv.org/pdf/1912.13494.pdf). Mathematical Programming. -[80] B.T. Polyak (1964). +[82] B.T. Polyak (1964). [Some methods of speeding up the convergence of iteration method](https://www.sciencedirect.com/science/article/pii/0041555364901375). URSS Computational Mathematics and Mathematical Physics. -[81] F. Maryam, H. Hindi, S. Boyd (2003). +[83] F. Maryam, H. Hindi, S. Boyd (2003). [Log-det heuristic for matrix rank minimization with applications to Hankel and Euclidean distance matrices](https://web.stanford.edu/~boyd/papers/pdf/rank_min_heur_hankel.pdf). American Control Conference (ACC). -[82] J.P. Boyle, R.L. Dykstra (1986). +[84] J.P. Boyle, R.L. Dykstra (1986). [A method for finding projections onto the intersection of convex sets in Hilbert spaces](https://link.springer.com/chapter/10.1007/978-1-4613-9940-7_3). Lecture Notes in Statistics. Vol. 37. pp. 28–47. -[83] D. Kim, J. Fessler (2017). +[85] D. Kim, J. Fessler (2017). [On the convergence analysis of the optimized gradient method](https://arxiv.org/pdf/1510.08573.pdf). Journal of Optimization Theory and Applications, 172(1), 187-205. -[84] J. Von Neumann (1949). +[86] J. Von Neumann (1949). [On rings of operators](https://www.jstor.org/stable/1969463). Reduction theory. Annals of Mathematics, pp. 401–485. -[85] A. C. Wilson, B. Recht, M. I. Jordan (2021). +[87] A. C. Wilson, B. Recht, M. I. Jordan (2021). [A Lyapunov analysis of accelerated methods in optimization](https://jmlr.org/papers/volume22/20-195/20-195.pdf). In the Journal of Machine Learning Reasearch (JMLR), 22(113):1−34, 2021. -[86] J.M. Sanz-Serna and K. C. Zygalakis (2021). +[88] J.M. Sanz-Serna and K. C. Zygalakis (2021). [The connections between Lyapunov functions for some optimization algorithms and differential equations](https://arxiv.org/pdf/2009.00673.pdf). In SIAM Journal on Numerical Analysis, 59 pp 1542-1565. -[87] C. Moucer, A. Taylor, F. Bach (2022). +[89] C. Moucer, A. Taylor, F. Bach (2022). [A systematic approach to Lyapunov analyses of continuous-time models in convex optimization](https://arxiv.org/pdf/2205.12772.pdf). In SIAM Journal on Optimization 33 (3), 1558-1586. -[88] W. Su, S. Boyd, E. J. Candès (2016). +[90] W. Su, S. Boyd, E. J. Candès (2016). [A differential equation for modeling Nesterov's accelerated gradient method: Theory and insights](https://jmlr.org/papers/volume17/15-084/15-084.pdf). In the Journal of Machine Learning Research (JMLR). -[89] D. Scieur, V. Roulet, F. Bach and A. D'Aspremont (2017). +[91] D. Scieur, V. Roulet, F. Bach and A. D'Aspremont (2017). [Integration methods and accelerated optimization algorithms](https://papers.nips.cc/paper/2017/file/bf62768ca46b6c3b5bea9515d1a1fc45-Paper.pdf). In Advances in Neural Information Processing Systems (NIPS). -[90] M. Kirszbraun (1934). +[92] M. Kirszbraun (1934). [Uber die zusammenziehende und Lipschitzsche transformationen](https://eudml.org/doc/212681). Fundamenta Mathematicae, 22 (1934). -[91] F.A. Valentine (1943). +[93] F.A. Valentine (1943). [On the extension of a vector function so as to preserve a Lipschitz condition](https://projecteuclid.org/journals/bulletin-of-the-american-mathematical-society/volume-49/issue-2). 
Bulletin of the American Mathematical Society, 49 (2). -[92] F.A. Valentine (1945). +[94] F.A. Valentine (1945). [A Lipschitz condition preserving extension for a vector function](https://www.jstor.org/stable/2371917). American Journal of Mathematics, 67(1). -[93] H. H. Bauschke and P. L. Combettes (2017). +[95] H. H. Bauschke and P. L. Combettes (2017). [Convex Analysis and Monotone Operator Theory in Hilbert Spaces](https://link.springer.com/book/10.1007/978-3-319-48311-5). Springer New York. -[94] E. Gorbunov, A. Taylor, S. Horváth, G. Gidel (2023). +[96] E. Gorbunov, A. Taylor, S. Horváth, G. Gidel (2023). [Convergence of proximal point and extragradient-based methods beyond monotonicity: the case of negative comonotonicity](https://proceedings.mlr.press/v202/gorbunov23a/gorbunov23a.pdf). International Conference on Machine Learning. diff --git a/docs/source/examples/a.rst b/docs/source/examples/a.rst index 0486dd54..dc75834b 100644 --- a/docs/source/examples/a.rst +++ b/docs/source/examples/a.rst @@ -41,6 +41,11 @@ Accelerated gradient for convex objective .. autofunction:: PEPit.examples.unconstrained_convex_minimization.wc_accelerated_gradient_convex +Simplified accelerated gradient for convex objective +---------------------------------------------------- +.. autofunction:: PEPit.examples.unconstrained_convex_minimization.wc_accelerated_gradient_convex_simplified + + Accelerated gradient for strongly convex objective -------------------------------------------------- .. autofunction:: PEPit.examples.unconstrained_convex_minimization.wc_accelerated_gradient_strongly_convex diff --git a/docs/source/examples/b.rst b/docs/source/examples/b.rst index 207b6b80..df5f0693 100644 --- a/docs/source/examples/b.rst +++ b/docs/source/examples/b.rst @@ -16,11 +16,16 @@ Proximal gradient on quadratics .. autofunction:: PEPit.examples.composite_convex_minimization.wc_proximal_gradient_quadratics -Accelerated proximal gradient ------------------------------ +Accelerated proximal gradient (a.k.a., FISTA) +--------------------------------------------- .. autofunction:: PEPit.examples.composite_convex_minimization.wc_accelerated_proximal_gradient +Simplified accelerated proximal gradient +---------------------------------------- +.. autofunction:: PEPit.examples.composite_convex_minimization.wc_accelerated_proximal_gradient_simplified + + Bregman proximal point ----------------------- .. autofunction:: PEPit.examples.composite_convex_minimization.wc_bregman_proximal_point diff --git a/docs/source/whatsnew/0.4.0.rst b/docs/source/whatsnew/0.4.0.rst index a387acc6..a355fb09 100644 --- a/docs/source/whatsnew/0.4.0.rst +++ b/docs/source/whatsnew/0.4.0.rst @@ -11,7 +11,7 @@ What's new in PEPit 0.4.0 - New primitive step: linearly shifted optimization (minimize given function + linear term), which is among others used in the difference-of-convex algorithm (DCA, also known as the convex-concave procedure). 
-- New examples: (i) Refined block-coordinate descent (refined interpolation techniques--but computationally more expensive) (ii) introduction of quadratic Lojasiewicz inequalities, examples on gradient descent (with different ways to improse Lojasiewicz inequalities, from naive ones to more advanced SDP-representable ones) (iii) optimistic gradient with more advanced SDP-representable monotonocity/Lipschitz characterization, (iv) difference-of-convex algorithm (DCA, also known as the convex-concave procedure), (v) online learning settings (online gradient descent, online Frank-Wolfe, follow the leader, follow the regularized leader). +- New examples: (i) Refined block-coordinate descent (refined interpolation techniques--but computationally more expensive) (ii) introduction of quadratic Lojasiewicz inequalities, examples on gradient descent (with different ways to impose Lojasiewicz inequalities, from naive ones to more advanced SDP-representable ones) (iii) optimistic gradient with more advanced SDP-representable monotonicity/Lipschitz characterization, (iv) difference-of-convex algorithm (DCA, also known as the convex-concave procedure), (v) online learning settings (online gradient descent, online Frank-Wolfe, follow the leader, follow the regularized leader), (vi) classical accelerated gradient methods (with FISTA-like inertial coefficients); their previously implemented versions are now referred to as simplified variants. - Corrected example: SGD's closed-form solution has been updated. diff --git a/tests/test_examples.py b/tests/test_examples.py index 6d381d1e..d40f6829 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -13,6 +13,7 @@ from PEPit.examples.unconstrained_convex_minimization import wc_gradient_descent_silver_stepsize_strongly_convex from PEPit.examples.unconstrained_convex_minimization import wc_subgradient_method_rsi_eb from PEPit.examples.unconstrained_convex_minimization import wc_accelerated_gradient_convex +from PEPit.examples.unconstrained_convex_minimization import wc_accelerated_gradient_convex_simplified from PEPit.examples.unconstrained_convex_minimization import wc_accelerated_gradient_strongly_convex from PEPit.examples.unconstrained_convex_minimization import wc_accelerated_proximal_point from PEPit.examples.unconstrained_convex_minimization import wc_proximal_point @@ -32,6 +33,7 @@ from PEPit.examples.unconstrained_convex_minimization import wc_cyclic_coordinate_descent from PEPit.examples.composite_convex_minimization import wc_accelerated_douglas_rachford_splitting from PEPit.examples.composite_convex_minimization import wc_accelerated_proximal_gradient +from PEPit.examples.composite_convex_minimization import wc_accelerated_proximal_gradient_simplified from PEPit.examples.composite_convex_minimization import wc_bregman_proximal_point from PEPit.examples.composite_convex_minimization import wc_frank_wolfe from PEPit.examples.composite_convex_minimization import wc_douglas_rachford_splitting @@ -391,6 +393,12 @@ def test_accelerated_gradient_convex(self): mu, L, n = 0, 1, 10 wc, theory = wc_accelerated_gradient_convex(mu, L, n, wrapper=self.wrapper, verbose=self.verbose) + self.assertLessEqual(wc, theory) + + def test_accelerated_gradient_convex_simplified(self): mu, L, n = 0, 1, 10 + + wc, theory = wc_accelerated_gradient_convex_simplified(mu, L, n, wrapper=self.wrapper, verbose=self.verbose) self.assertAlmostEqual(wc, theory, delta=self.relative_precision * theory) def test_accelerated_gradient_strongly_convex(self): @@ -403,6
+411,12 @@ def test_accelerated_proximal_gradient_method(self): mu, L, n = 0, 1, 5 wc, theory = wc_accelerated_proximal_gradient(mu, L, n, wrapper=self.wrapper, verbose=self.verbose) + self.assertLessEqual(wc, theory) + + def test_accelerated_proximal_gradient_method_simplified(self): + mu, L, n = 0, 1, 5 + + wc, theory = wc_accelerated_proximal_gradient_simplified(mu, L, n, wrapper=self.wrapper, verbose=self.verbose) self.assertAlmostEqual(wc, theory, delta=self.relative_precision * theory) def test_accelerated_douglas_rachford_splitting(self):
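For reference, the two inertial schedules appearing above, the FISTA-style coefficient (lam_old - 1) / lam introduced in this patch and the simplified coefficient i / (i + 3) kept in the *_simplified examples, can be compared with a short standalone snippet. This is an illustrative sketch only: the helper name momentum_coefficients is not part of the patch, and the snippet does not require PEPit.

```python
from math import sqrt


def momentum_coefficients(n):
    """Compare the two momentum schedules used in the accelerated (proximal) gradient examples.

    Returns two lists of length n: the FISTA-style coefficients (lam_old - 1) / lam,
    where lam_{t+1} = (1 + sqrt(4 * lam_t ** 2 + 1)) / 2 with lam_1 = 1,
    and the simplified coefficients i / (i + 3).
    """
    lam = 1
    fista_coefficients, simplified_coefficients = [], []
    for i in range(n):
        lam_old = lam
        lam = (1 + sqrt(4 * lam_old ** 2 + 1)) / 2
        fista_coefficients.append((lam_old - 1) / lam)
        simplified_coefficients.append(i / (i + 3))
    return fista_coefficients, simplified_coefficients


if __name__ == "__main__":
    fista, simplified = momentum_coefficients(5)
    print(fista)       # approximately [0.0, 0.2818, 0.4340, ...]
    print(simplified)  # [0.0, 0.25, 0.4, 0.5, 0.5714...]
```

The new tests reflect this distinction: the FISTA-coefficient examples are only checked against their (non-tight) theoretical bound with assertLessEqual, while the simplified variants keep the tight assertAlmostEqual check.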