[Home]  [Edit this page]  [Recent Changes]  [Special Pages]  [Help
CppReinforcementLearning

(C++) Reinforcement learning

Epsilon learning, simple action value method

From http://richelbilderbeek.nl/CppNarmedBanditLearner.htm:

Give a learner a set of actions, with every action having a certain expected reward. Actions taken are evaluated by adjusting the expected rewards. The chance an action is equal to its expected reward of the sum of expected rewards.

  1. include <iostream>
  2. include <vector>
  3. include <algorithm>
  4. include <numeric>
  5. include <cassert>
//Returns a random value from 0.0 to (not including) 1.0 //From http://www.codepedia.com/1/CppRand double uniform() { return static_cast<double>(std::rand())/static_cast<double>(RAND_MAX); } //Select an index from a std::vector of chances //Asserts all chances are bigger or equal 0.0 int SelectIndex(const std::vector<double>& chances) { const int nChances = chances.size(); const double sumChances = std::accumulate(chances.begin(), chances.end(),0.0); if (sumChances == 0.0) return std::rand() % nChances; double p = uniform() * sumChances; int i = -1; //The index that will be chosen for (i=0; i<nChances; ++i) { assert(chances[i] >= 0.0); p-=chances[i]; if (p<0) break; } assert(i != nChances); return i; } //Default chances for the N-armed bandit std::vector<double> GetDefaultChances() { std::vector<double> chances; chances.push_back(0.1); chances.push_back(0.2); chances.push_back(0.4); chances.push_back(0.6); chances.push_back(0.8); chances.push_back(0.9); std::random_shuffle(chances.begin(), chances.end()); return chances; } struct Learner { Learner(const std::vector<double>& initRewards) : mRewards(initRewards), mLearningRate(0.01), mEpsilon(0.01) { } int SelectAction() const { //Explore or exploit? if (uniform() < mEpsilon) { //Select random action mLastAction = std::rand() % mRewards.size(); } else { //Selects an action probabilistically //and stores this action mLastAction = SelectIndex(mRewards); } return mLastAction; } void EvaluateAction(const double& reward) { //Assert mLastAction is a valid index assert(mLastAction >= 0 && mLastAction < static_cast<int>(mRewards.size())); //Change the expected reward mRewards[mLastAction] += (mLearningRate * (reward - mRewards[mLastAction])); } const std::vector<double>& GetRewards() const { return mRewards; } private: std::vector<double> mRewards; double mLearningRate; double mEpsilon; mutable int mLastAction; }; struct NarmedBandit { NarmedBandit(const std::vector<double>& chances) : mChances(chances) { } bool PlayAction(const int& action) const { //Assert action is a valid index assert(action >= 0 && action < static_cast<int>(mChances.size())); //Determine the chance to win of this arm const double pWin = mChances[action]; if (uniform() < pWin) return true; //Won! else return false; //Lost } const std::vector<double> mChances; }; //A functor, see e.g. http://www.codepedia.com/1/CppFunctor template <typename T> struct Couter { void operator()(const T& t) { std::cout << t << std::endl; } }; int main() { //Create a bandit with 6 arms NarmedBandit bandit(GetDefaultChances()); //Create a learner with 6 actions (all with expected reward 0.0) Learner learner(std::vector<double>(6,0.0)); //Assume that the bandit has as many arms as the learner has actions assert(bandit.mChances.size() == learner.GetRewards().size()); //Let the learner take many actions for (int i=0; i<1000; ++i) { { //Optional output //Show start of action std::cout << std::endl << "Starting action " << i << std::endl; //Show the expected rewards of the learner std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); } //Let the learner select an action const int action = learner.SelectAction(); const bool wasSuccess = bandit.PlayAction(action); const double reward = (wasSuccess ? 1.0 : 0.0); learner.EvaluateAction(reward); } //Show the expected rewards of the learner std::cout << std::endl << "Final expected rewards of the learner: " << std::endl; std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); //Show the bandit's chances std::cout << std::endl << "Winchances of the bandit: " << std::endl; std::for_each(bandit.mChances.begin(), bandit.mChances.end(),Couter<double>()); std::cout << "Program done" << std::endl; std::cin.get(); }


Epsilon learning, simple action value method with softmax selection

From http://richelbilderbeek.nl/CppNarmedBanditLearner.htm:

Same as above, except that the chance an action is selected equals a Gibbs or Boltzmann distribution:

           exp( Q(a) / tau)
p(a) = -----------------------------
           SUM ( exp( Q(a) / tau) )
p(a) : chance action a is selected
exp( ) : natural exponential
Q(a) : expected value of action a
tau: temperature
SUM: summed over all actions


  1. include <iostream>
  2. include <vector>
  3. include <algorithm>
  4. include <numeric>
  5. include <cassert>
  6. include <cmath>
//Returns a random value from 0.0 to (not including) 1.0 //From http://www.codepedia.com/1/CppRand double uniform() { return static_cast<double>(std::rand())/static_cast<double>(RAND_MAX); } struct GibbsFunctor : public std::binary_function<double, double, double> { GibbsFunctor(const double& tau) : mTau(tau) {} double operator() (const double& sum, const double& x) const { return sum + Gibbs(x); } const double mTau; double Gibbs(const double& x) const { return std::exp( x / mTau ); } static double Gibbs(const double& x, const double tau) { return std::exp( x / tau ); } }; int SelectIndexSoftmax(const std::vector<double>& chances, const double& tau) { const int nChances = chances.size(); const double sumChances = std::accumulate(chances.begin(), chances.end(),0.0,GibbsFunctor(tau)); if (sumChances == 0.0) return std::rand() % nChances; double p = uniform() * sumChances; int i = -1; //The index that will be chosen for (i=0; i<nChances; ++i) { assert(chances[i] >= 0.0); const double chance = GibbsFunctor::Gibbs(chances[i],tau); assert(chance >= 0.0); p-=chance; if (p<0.0) break; } assert(i != nChances); return i; } //Default chances for the N-armed bandit std::vector<double> GetDefaultChances() { std::vector<double> chances; chances.push_back(0.1); chances.push_back(0.2); chances.push_back(0.4); chances.push_back(0.6); chances.push_back(0.8); chances.push_back(0.9); std::random_shuffle(chances.begin(), chances.end()); return chances; } struct Learner { Learner(const std::vector<double>& initRewards) : mRewards(initRewards), mLearningRate(0.01), mEpsilon(0.01), mTau(0.1) { } int SelectAction() const { //Explore or exploit? if (uniform() < mEpsilon) { //Select random action mLastAction = std::rand() % mRewards.size(); } else { //Selects an action probabilistically //and stores this action mLastAction = SelectIndexSoftmax(mRewards, mTau); } return mLastAction; } void EvaluateAction(const double& reward) { //Assert mLastAction is a valid index assert(mLastAction >= 0 && mLastAction < static_cast<int>(mRewards.size())); //Change the expected reward mRewards[mLastAction] += (mLearningRate * (reward - mRewards[mLastAction])); } const std::vector<double>& GetRewards() const { return mRewards; } private: std::vector<double> mRewards; double mLearningRate; double mEpsilon; double mTau; mutable int mLastAction; }; struct NarmedBandit { NarmedBandit(const std::vector<double>& chances) : mChances(chances) { } bool PlayAction(const int& action) const { //Assert action is a valid index assert(action >= 0 && action < static_cast<int>(mChances.size())); //Determine the chance to win of this arm const double pWin = mChances[action]; if (uniform() < pWin) return true; //Won! else return false; //Lost } const std::vector<double> mChances; }; template <typename T> struct Couter { void operator()(const T& t) { std::cout << t << std::endl; } }; int main() { //Create a bandit with 6 arms NarmedBandit bandit(GetDefaultChances()); //Create a learner with 6 actions (all with expected reward 0.0) Learner learner(std::vector<double>(6,0.0)); //Assume that the bandit has as many arms as the learner has actions assert(bandit.mChances.size() == learner.GetRewards().size()); //Keep track of the total rewards gathered by the learner double sumReward = 0.0; //Let the learner take many actions for (int i=0; i<10000; ++i) { { //Optional output //Show start of action //std::cout << std::endl << "Starting action " << i << std::endl; //Show the expected rewards of the learner //std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); } //Let the learner select an action const int action = learner.SelectAction(); const bool wasSuccess = bandit.PlayAction(action); const double reward = (wasSuccess ? 1.0 : 0.0); learner.EvaluateAction(reward); //Track the total rewards gained by the learner sumReward+=reward; } //Show the expected rewards of the learner std::cout << std::endl << "Final expected rewards of the learner: " << std::endl; std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); //Show the bandit's chances std::cout << std::endl << "Winchances of the bandit: " << std::endl; std::for_each(bandit.mChances.begin(), bandit.mChances.end(),Couter<double>()); //Show the total reward gained std::cout << std::endl << "The learner gained " << sumReward << " reward in total." <<std::endl; std::cout << "Program done" << std::endl; }


All methods of above in one class

The selection method is put into a Strategy  ?design pattern.

  1. include <iostream>
  2. include <vector>
  3. include <algorithm>
  4. include <numeric>
  5. include <cassert>
  6. include <cmath>
  7. include <ctime>
  8. include <memory>
//Returns a random value from 0.0 to (not including) 1.0 //From http://www.codepedia.com/1/CppRand double uniform() { return static_cast<double>(std::rand())/static_cast<double>(RAND_MAX); } //A State struct DistributionBase { virtual ~DistributionBase() {} virtual int SelectIndex(const std::vector<double>& chances) const = 0; }; struct GibbsFunctor : public std::binary_function<double, double, double> { GibbsFunctor(const double& tau) : mTau(tau) {} double operator() (const double& sum, const double& x) const { return sum + Gibbs(x); } const double mTau; double Gibbs(const double& x) const { return std::exp( x / mTau ); } static double Gibbs(const double& x, const double tau) { return std::exp( x / tau ); } }; struct DistributionEqual : public DistributionBase { int SelectIndex(const std::vector<double>& chances) const { const int nChances = chances.size(); const double sumChances = std::accumulate(chances.begin(), chances.end(),0.0); if (sumChances == 0.0) return std::rand() % nChances; double p = uniform() * sumChances; int i = -1; //The index that will be chosen for (i=0; i<nChances; ++i) { assert(chances[i] >= 0.0); p-=chances[i]; if (p<0.0) break; } assert(i != nChances); return i; } }; struct DistributionSoftmax : public DistributionBase { DistributionSoftmax(const double& tau) : mTau(tau) {} int SelectIndex(const std::vector<double>& chances) const { const int nChances = chances.size(); const double sumChances = std::accumulate(chances.begin(), chances.end(),0.0,GibbsFunctor(mTau)); if (sumChances == 0.0) return std::rand() % nChances; double p = uniform() * sumChances; int i = -1; //The index that will be chosen for (i=0; i<nChances; ++i) { assert(chances[i] >= 0.0); const double chance = GibbsFunctor::Gibbs(chances[i],mTau); assert(chance >= 0.0); p-=chance; if (p<0.0) break; } assert(i != nChances); return i; } const double mTau; }; //Default chances for the N-armed bandit std::vector<double> GetDefaultChances() { std::vector<double> chances; chances.push_back(0.1); chances.push_back(0.2); chances.push_back(0.4); chances.push_back(0.6); chances.push_back(0.8); chances.push_back(0.9); std::random_shuffle(chances.begin(), chances.end()); return chances; } struct Learner { Learner(const std::vector<double>& initRewards) : mRewards(initRewards), mLearningRate(0.01), mEpsilon(0.01) { } int SelectAction() const { //Explore or exploit? if (uniform() < mEpsilon) { //Select random action mLastAction = std::rand() % mRewards.size(); } else { //Selects an action probabilistically //and stores this action assert(mDistribution.get()!=0); mLastAction = mDistribution->SelectIndex(mRewards); } return mLastAction; } void EvaluateAction(const double& reward) { //Assert mLastAction is a valid index assert(mLastAction >= 0 && mLastAction < static_cast<int>(mRewards.size())); //Change the expected reward mRewards[mLastAction] += (mLearningRate * (reward - mRewards[mLastAction])); } const std::vector<double>& GetRewards() const { return mRewards; } void SetDistribution(std::auto_ptr<DistributionBase> distribution) { mDistribution = distribution; } private: std::vector<double> mRewards; std::auto_ptr<DistributionBase> mDistribution; //A State double mLearningRate; double mEpsilon; mutable int mLastAction; }; struct NarmedBandit { NarmedBandit(const std::vector<double>& chances) : mChances(chances) { } bool PlayAction(const int& action) const { //Assert action is a valid index assert(action >= 0 && action < static_cast<int>(mChances.size())); //Determine the chance to win of this arm const double pWin = mChances[action]; if (uniform() < pWin) return true; //Won! else return false; //Lost } const std::vector<double> mChances; }; template <typename T> struct Couter { void operator()(const T& t) { std::cout << t << std::endl; } }; int main() { std::srand(std::clock()); //Create a bandit with 6 arms NarmedBandit bandit(GetDefaultChances()); //Create a learner with 6 actions (all with expected reward 0.0) Learner learner(std::vector<double>(6,0.0)); //Assume that the bandit has as many arms as the learner has actions assert(bandit.mChances.size() == learner.GetRewards().size()); { //Set the distribution, chosen at random const int distributionIndex = (std::rand() >> 4) % 2; //Lower bits have lower randomness switch (distributionIndex) { case 0: //Set the equal distribution std::cout << "The learner chooses an action by normal selection" << std::endl; learner.SetDistribution(std::auto_ptr<DistributionBase>(new DistributionEqual)); break; case 1: //Set the softmax distribution std::cout << "The learner chooses an action by softmax selection" << std::endl; learner.SetDistribution(std::auto_ptr<DistributionBase>(new DistributionSoftmax(0.1))); break; default: assert(!"Should not get here"); } } //Keep track of the total rewards gathered by the learner double sumReward = 0.0; //Let the learner take many actions for (int i=0; i<10000; ++i) { { //Optional output //Show start of action //std::cout << std::endl << "Starting action " << i << std::endl; //Show the expected rewards of the learner //std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); } //Let the learner select an action const int action = learner.SelectAction(); const bool wasSuccess = bandit.PlayAction(action); const double reward = (wasSuccess ? 1.0 : 0.0); learner.EvaluateAction(reward); //Track the total rewards gained by the learner sumReward+=reward; } //Show the expected rewards of the learner std::cout << std::endl << "Final expected rewards of the learner: " << std::endl; std::for_each(learner.GetRewards().begin(), learner.GetRewards().end(),Couter<double>()); //Show the bandit's chances std::cout << std::endl << "Winchances of the bandit: " << std::endl; std::for_each(bandit.mChances.begin(), bandit.mChances.end(),Couter<double>()); //Show the total reward gained std::cout << std::endl << "The learner gained " << sumReward << " reward in total." <<std::endl; std::cout << "Program done" << std::endl; }


'Reinforcement learning' links

Code links



last edited (March 8, 2007) by bilderbikkel, Number of views: 3088, Current Rev: 9 (Diff)

[Edit this page]  [Page history]  [What links here]  [Discuss this topic]  [Printer Friendly]  

Members

Username:

Password:


Register
Forgot Password?




Programmers Heaven - for .NET, Java, C/C++ and WEB Developers!
© 1996-2008 Community Networks Ltd. All rights reserved. Reproduction in whole or in part, in any form or medium without express written permission is prohibited. Violators of this policy may be subject to legal action. Please read Terms Of Use and Privacy Statement for more information. Development by Tore Nestenius at .NET Consultant - Synchron Data.