Model-free approaches for reinforcement learning (RL) and continuous control find policies based only on past states and rewards, without fitting a model of the system dynamics. They are appealing because they are general-purpose and easy to implement; however, they also come with fewer theoretical guarantees than model-based RL. In this work, we present a new model-free algorithm for controlling linear quadratic (LQ) systems, and show that its regret scales as O(T^(2/3+x)) for any small x > 0, provided the time horizon satisfies T > C^(1/x) for a constant C. The algorithm is based on a reduction of the control of Markov decision processes to an expert prediction problem. In practice, it corresponds to a variant of policy iteration with forced exploration, where the policy in each phase is greedy with respect to the average of all previous value functions.
This is the first model-free algorithm for adaptive control of LQ systems that provably achieves sublinear regret at polynomial computational cost. Empirically, our algorithm dramatically outperforms standard policy iteration, but performs worse than a model-based approach.
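The phrase "greedy with respect to the average of all previous value functions" is the algorithm's key departure from standard policy iteration, and the idea is easy to see in code. Below is a minimal sketch of that idea only, not the authors' implementation: it assumes known dynamics (A, B) and a discount factor, so policy evaluation reduces to an exact fixed-point iteration, whereas the paper estimates value functions model-free from rollouts with forced exploration. All names (evaluate_policy, averaged_policy_iteration, the toy system) are illustrative.

    import numpy as np

    def evaluate_policy(A, B, Q, R, K, gamma=0.95, iters=500):
        # Value matrix P of the linear policy u = K x, so that V(x) = x' P x.
        # Exact fixed-point iteration on the policy's Bellman equation;
        # this uses known (A, B), which the actual model-free algorithm avoids.
        P = np.zeros_like(Q)
        A_cl = A + B @ K                    # closed-loop dynamics
        cost = Q + K.T @ R @ K              # per-step quadratic cost under K
        for _ in range(iters):
            P = cost + gamma * A_cl.T @ P @ A_cl
        return P

    def greedy_policy(A, B, R, P, gamma=0.95):
        # Policy that is greedy with respect to the quadratic value x' P x.
        return -gamma * np.linalg.solve(R + gamma * B.T @ P @ B, B.T @ P @ A)

    def averaged_policy_iteration(A, B, Q, R, phases=20):
        # Phase i: evaluate the current policy, then act greedily with
        # respect to the AVERAGE of all value matrices computed so far.
        n, m = B.shape
        K = np.zeros((m, n))                # start from the zero policy
        P_history = []
        for _ in range(phases):
            P_history.append(evaluate_policy(A, B, Q, R, K))
            P_bar = np.mean(P_history, axis=0)
            K = greedy_policy(A, B, R, P_bar)
        return K

    # Toy usage on a hypothetical stable two-dimensional system:
    A = np.array([[0.9, 0.1], [0.0, 0.8]])
    B = np.array([[0.0], [1.0]])
    K = averaged_policy_iteration(A, B, np.eye(2), np.eye(1))

Standard policy iteration would set K greedy with respect to the latest P only; per the abstract, it is the averaging step that corresponds to the reduction to expert prediction in the paper's analysis.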
@inproceedings{AYLaSz19,
author = {Abbasi-Yadkori, Y. and Lazic, N. and {Sz}epesv{\'a}ri, {Cs}.},
booktitle = {AISTATS},
keywords = {LQR, Markov Decision Processes, dynamical systems, exploration vs. exploitation, linear, online learning, theory},
title = {Model-Free Linear Quadratic Control via Reduction to Expert Prediction},
year = 2019
}