diff --git a/HW2_Policy_Graident.ipynb b/HW2_Policy_Graident.ipynb index 3b51aa3..adcce41 100644 --- a/HW2_Policy_Graident.ipynb +++ b/HW2_Policy_Graident.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -26,11 +26,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gym.envs.registration:Making new env: CartPole-v0\n", + "[2016-10-12 13:50:57,388] Making new env: CartPole-v0\n" + ] + } + ], "source": [ "import gym\n", "import tensorflow as tf\n", @@ -38,11 +47,12 @@ "from policy_gradient import util\n", "from policy_gradient.policy import CategoricalPolicy\n", "from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline\n", + "from IPython.display import clear_output\n", "\n", "np.random.seed(0)\n", "tf.set_random_seed(0)\n", "\n", - "# CartPole-v0 is a MDP with finite state and action space. \n", + "# CartPole-v0 is a MDP with finite st ate and action space. \n", "# In this environment, A pendulum is attached by an un-actuated joint to a cart, \n", "# and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart.\n", "# A reward of +1 is provided for every timestep that the pendulum remains upright. \n", @@ -95,23 +105,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py:90: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ] + } + ], "source": [ "sess = tf.Session()\n", "\n", "# Construct a neural network to represent policy which maps observed state to action. \n", - "in_dim = util.flatten_space(env.observation_space)\n", - "out_dim = util.flatten_space(env.action_space)\n", + "in_dim = util.flatten_space(env.observation_space) #4\n", + "out_dim = util.flatten_space(env.action_space) #2\n", "hidden_dim = 8\n", "\n", "opt = tf.train.AdamOptimizer(learning_rate=0.01)\n", - "policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)\n", - "\n", - "sess.run(tf.initialize_all_variables())" + "policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)" ] }, { @@ -151,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": true }, @@ -176,7 +193,7 @@ " ob = self.env.reset()\n", "\n", " for _ in range(self.path_length):\n", - " a = self.policy.act(ob.reshape(1, -1))\n", + " a = self.policy.act(ob.reshape(1, -1)) # (1,4)\n", " next_ob, r, done, _ = self.env.step(a)\n", " obs.append(ob)\n", " actions.append(a)\n", @@ -200,7 +217,6 @@ " \n", " # `p[\"rewards\"]` is a matrix contains the rewards of each timestep in a sample path\n", " r = util.discount_cumsum(p[\"rewards\"], self.discount_rate)\n", - " \n", " \"\"\"\n", " Problem 4:\n", "\n", @@ -210,7 +226,7 @@ " Sample solution should be only 1 line.\n", " \"\"\"\n", " # YOUR CODE HERE >>>>>>\n", - " # a = ???\n", + " a = r - b\n", " # <<<<<<<<\n", "\n", " p[\"returns\"] = r\n", @@ -230,6 +246,7 @@ " )\n", "\n", " def train(self):\n", + " avg_return_list = []\n", " for i in range(1, self.n_iter + 1):\n", " paths = []\n", " for _ in range(self.n_episode):\n", @@ -238,14 +255,15 @@ " loss = self.policy.train(data[\"observations\"], data[\"actions\"], data[\"advantages\"])\n", " avg_return = np.mean([sum(p[\"rewards\"]) for p in paths])\n", " print(\"Iteration {}: Average Return = {}\".format(i, avg_return))\n", - " \n", + " avg_return_list.append(avg_return)\n", 
" # CartPole-v0 defines \"solving\" as getting average reward of 195.0 over 100 consecutive trials.\n", " if avg_return >= 195:\n", " print(\"Solve at {} iterations, which equals {} episodes.\".format(i, i*100))\n", " break\n", "\n", " if self.baseline != None:\n", - " self.baseline.fit(paths)" + " self.baseline.fit(paths)\n", + " return avg_return_list, i" ] }, { @@ -254,8 +272,50 @@ "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Average Return = 18.04\n", + "Iteration 2: Average Return = 18.83\n", + "Iteration 3: Average Return = 21.8\n", + "Iteration 4: Average Return = 20.61\n", + "Iteration 5: Average Return = 23.93\n", + "Iteration 6: Average Return = 23.09\n", + "Iteration 7: Average Return = 27.2\n", + "Iteration 8: Average Return = 29.16\n", + "Iteration 9: Average Return = 31.71\n", + "Iteration 10: Average Return = 32.73\n", + "Iteration 11: Average Return = 34.81\n", + "Iteration 12: Average Return = 37.38\n", + "Iteration 13: Average Return = 39.43\n", + "Iteration 14: Average Return = 40.16\n", + "Iteration 15: Average Return = 41.08\n", + "Iteration 16: Average Return = 49.69\n", + "Iteration 17: Average Return = 50.16\n", + "Iteration 18: Average Return = 45.54\n", + "Iteration 19: Average Return = 49.39\n", + "Iteration 20: Average Return = 54.45\n", + "Iteration 21: Average Return = 48.18\n", + "Iteration 22: Average Return = 52.51\n", + "Iteration 23: Average Return = 49.41\n", + "Iteration 24: Average Return = 56.92\n", + "Iteration 25: Average Return = 55.99\n", + "Iteration 26: Average Return = 51.51\n", + "Iteration 27: Average Return = 62.38\n", + "Iteration 28: Average Return = 55.51\n", + "Iteration 29: Average Return = 58.85\n", + "Iteration 30: Average Return = 62.69\n", + "Iteration 31: Average Return = 57.4\n", + "Iteration 32: Average Return = 58.77\n", + "Iteration 33: Average Return = 65.02\n", + "Iteration 34: Average Return = 65.5" 
+ ] + } + ], "source": [ + "#sess.run(tf.initialize_all_variables())\n", "n_iter = 200\n", "n_episode = 100\n", "path_length = 200\n", @@ -266,7 +326,160 @@ " discount_rate)\n", "\n", "# Train the policy optimizer\n", - "po.train()" + "avg_return_list = []\n", + "iter_list = []\n", + "for i in range(10):\n", + " sess.run(tf.initialize_all_variables())\n", + " avg_return_list_tem, iter = po.train()\n", + " avg_return_list.append(avg_return_list_tem)\n", + " iter_list.append(iter)\n", + " print i, 'trial =', iter\n", + "np.savez('w_baseline', avg_return_list=avg_return_list, iter_list=iter_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "zero-size array to reduction operation maximum which has no identity", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mll\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miter_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m 
\u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mll\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mavg_return_list\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/fromnumeric.pyc\u001b[0m in \u001b[0;36mamax\u001b[1;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[0;32m 2291\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2292\u001b[0m return _methods._amax(a, axis=axis,\n\u001b[1;32m-> 2293\u001b[1;33m out=out, **kwargs)\n\u001b[0m\u001b[0;32m 2294\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mamax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2295\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.pyc\u001b[0m in \u001b[0;36m_amax\u001b[1;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;31m# small reductions\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_amax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mumr_maximum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 27\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_amin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: zero-size array to reduction operation maximum which has no identity" + ] + } + ], + "source": [ + "import seaborn as sns \n", + "import matplotlib.pyplot as plt\n", + "ll = np.zeros([10, np.max(iter_list)])\n", + "for i in range(10):\n", + " ll[i,:len(avg_return_list[i])] = avg_return_list[i]\n", + "s = np.sum(ll, axis=0)\n", + "nonzero = np.sum((ll != 0)+0, axis=0)\n", + "s_ = s/nonzero\n", + "max = np.zeros([np.max(iter_list)])\n", + "min = np.zeros([np.max(iter_list)])\n", + "for i in range(np.max(iter_list)):\n", + " max[i] = 0\n", + " min[i] = 1000\n", + " for j in range(10):\n", + " try:\n", + " if max[i] < avg_return_list[j][i]:\n", + " max[i] = avg_return_list[j][i]\n", + " if min[i] > avg_return_list[j][i]:\n", + " min[i] 
= avg_return_list[j][i]\n", + " except IndexError:\n", + " a=1\n", + "max = max - s_\n", + "min = s_ - min\n", + "xs=np.linspace(1,np.max(iter_list),np.max(iter_list))\n", + "plt.errorbar(xs, s_, yerr = [min, max])\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Avg_return')\n", + "#plt.show()\n", + "plt.savefig('without_variance_reduce_max.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(103,)\n" + ] + } + ], + "source": [ + "print min.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "setting an array element with a sequence.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mxs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m 
\u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxlabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Iteration'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Avg_return'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/pyplot.pyc\u001b[0m in \u001b[0;36mplot\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 3152\u001b[0m \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3153\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3154\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3155\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3156\u001b[0m \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwashold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.pyc\u001b[0m in \u001b[0;36minner\u001b[1;34m(ax, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1809\u001b[0m warnings.warn(msg % (label_namer, func.__name__),\n\u001b[0;32m 1810\u001b[0m RuntimeWarning, stacklevel=2)\n\u001b[1;32m-> 1811\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0max\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1812\u001b[0m \u001b[0mpre_doc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minner\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__doc__\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpre_doc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_axes.pyc\u001b[0m in \u001b[0;36mplot\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1426\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1427\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_lines\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1428\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1429\u001b[0m \u001b[0mlines\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1430\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_base.pyc\u001b[0m in \u001b[0;36madd_line\u001b[1;34m(self, line)\u001b[0m\n\u001b[0;32m 1697\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_clip_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpatch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1698\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 
1699\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_update_line_limits\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1700\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1701\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'_line%d'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlines\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_base.pyc\u001b[0m in \u001b[0;36m_update_line_limits\u001b[1;34m(self, line)\u001b[0m\n\u001b[0;32m 1708\u001b[0m \u001b[0mFigures\u001b[0m \u001b[0mout\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdata\u001b[0m \u001b[0mlimit\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mupdating\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataLim\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1709\u001b[0m \"\"\"\n\u001b[1;32m-> 1710\u001b[1;33m \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1711\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvertices\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1712\u001b[0m \u001b[1;32mreturn\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 
+ "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/lines.pyc\u001b[0m in \u001b[0;36mget_path\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 924\u001b[0m \"\"\"\n\u001b[0;32m 925\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_invalidy\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_invalidx\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 926\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecache\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 927\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/lines.pyc\u001b[0m in \u001b[0;36mrecache\u001b[1;34m(self, always)\u001b[0m\n\u001b[0;32m 618\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0myconv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfilled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 619\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 620\u001b[1;33m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0myconv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 621\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 622\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/numeric.pyc\u001b[0m in \u001b[0;36masarray\u001b[1;34m(a, dtype, order)\u001b[0m\n\u001b[0;32m 480\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 481\u001b[0m \"\"\"\n\u001b[1;32m--> 482\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 483\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 484\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0masanyarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence." 
+ ] + } + ], + "source": [ + "import seaborn as sns \n", + "import matplotlib.pyplot as plt\n", + "\n", + "xs=np.linspace(1,len(avg_return_list),len(avg_return_list))\n", + "plt.plot(xs, avg_return_list)\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Avg_return')\n", + "plt.show()\n", + "plt.savefig('with_variance_reduce.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "xs=np.linspace(1,len(avg_return_list_n),len(avg_return_list_n))\n", + "plt.plot(xs, avg_return_list_n)\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Avg_return')\n", + "plt.show()\n", + "plt.savefig('without_variance_reduce.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "xs=np.linspace(1,len(avg_return_list),len(avg_return_list))\n", + "xs_n=np.linspace(1,len(avg_return_list_n),len(avg_return_list_n))\n", + "plt.plot(xs, avg_return_list, xs_n, avg_return_list_n)\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Avg_return')\n", + "plt.legend(['With variance reduction', 'Without variance reduction'], loc='upper left')\n", + "plt.savefig('compare.png')\n", + "plt.show()\n" ] }, { @@ -327,7 +540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.6" } }, "nbformat": 4, diff --git a/compare.png b/compare.png new file mode 100644 index 0000000..063aeb8 Binary files /dev/null and b/compare.png differ diff --git a/policy_gradient/policy.py b/policy_gradient/policy.py index 70b8cdc..bbc45fe 100644 --- a/policy_gradient/policy.py +++ b/policy_gradient/policy.py @@ -1,5 +1,6 @@ import tensorflow as tf import numpy as np +import pdb class CategoricalPolicy(object): def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): @@ -27,15 +28,15 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): Sample solution is about 
2~4 lines. """ # YOUR CODE HERE >>>>>> - # probs = ??? + # Andrew + h1 = tf.contrib.layers.fully_connected(self._observations, num_outputs=hidden_dim, activation_fn=tf.tanh) + h2 = tf.contrib.layers.fully_connected(h1, num_outputs=out_dim, activation_fn=None) + probs = tf.nn.softmax(h2) # (None,out_dim) # <<<<<<<< - # -------------------------------------------------- # This operation (variable) is used when choosing action during data sampling phase # Shape of probs: [1, n_actions] - act_op = probs[0, :] - # -------------------------------------------------- # Following operations (variables) are used when updating model # Shape of probs: [n_timestep_per_iter, n_actions] @@ -57,7 +58,6 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): # Add 1e-8 to `probs_vec` so as to prevent log(0) error log_prob = tf.log(probs_vec + 1e-8) - """ Problem 2: @@ -69,12 +69,13 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): Sample solution is about 1~3 lines. """ # YOUR CODE HERE >>>>>> - # surr_loss = ??? 
+ surr_loss = tf.reduce_mean(tf.mul(log_prob, self._advantages)) # <<<<<<<< - grads_and_vars = self._opt.compute_gradients(surr_loss) - train_op = self._opt.apply_gradients(grads_and_vars, name="train_op") - + #grads_and_vars = self._opt.compute_gradients(surr_loss) + #train_op = self._opt.apply_gradients(grads_and_vars, name="train_op") + + train_op =self._opt.minimize(-surr_loss) # -------------------------------------------------- # This operation (variable) is used when choosing action during data sampling phase self._act_op = act_op diff --git a/policy_gradient/util.py b/policy_gradient/util.py index 4c57674..1d43f69 100644 --- a/policy_gradient/util.py +++ b/policy_gradient/util.py @@ -1,6 +1,8 @@ from gym.spaces import Box, Discrete import numpy as np from scipy.signal import lfilter +import math +import pdb def flatten_space(space): if isinstance(space, Box): @@ -19,7 +21,12 @@ def flatten_space(space): Sample solution is about 1~7 lines. """ -# def discount_cumsum(x, discount_rate): - # YOUR CODE HERE >>>>>> - # return ??? - # <<<<<<<< \ No newline at end of file +def discount_cumsum(x, discount_rate): + # YOUR CODE HERE >>>>>> + discounted_r = np.zeros(len(x)) + num_r = len(x) + for i in range(num_r): + discounted_r[i] = x[i]*math.pow(discount_rate,i) + discounted_r = np.cumsum(discounted_r[::-1]) + return discounted_r[::-1] + # <<<<<<<< diff --git a/report.md b/report.md new file mode 100644 index 0000000..c5f64a7 --- /dev/null +++ b/report.md @@ -0,0 +1,72 @@ +# Policy gradient report +Member: 廖元宏(102061137), 莊景堯(102061145)   +Implement a simple agent with REINFORCE algorithm, which uses the MC sampling and policy gradient. + +## Problem 1~4 + +- Problem 1: construct a simple two layer FC layer for policy prediction +Here we use 2-layer neural network to represent the policy. Make sure you add softmax layer to represent probability distribution. 
+```python +h1 = tf.contrib.layers.fully_connected(self._observations, num_outputs=hidden_dim, activation_fn=tf.tanh) +h2 = tf.contrib.layers.fully_connected(h1, num_outputs=out_dim, activation_fn=None) +probs = tf.nn.softmax(h2) +``` +Use a simple two-layer perceptron to map the state to the action space. + +- Problem 2: surrogate loss +Since the optimizers in TensorFlow only support minimizing a loss (gradient descent), we simply add a minus sign to perform **gradient ascent**. +```python +surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages)) +``` + +- Problem 3: accumulated reward +Construct a simple for-loop to calculate the accumulated discounted reward from the end of the game to the start. + +```python +def discount_cumsum(x, discount_rate): + discounted_r = np.zeros(len(x)) + num_r = len(x) + for i in range(num_r): + discounted_r[i] = x[i]*math.pow(discount_rate,i) + discounted_r = np.cumsum(discounted_r[::-1]) + return discounted_r[::-1] +``` + +- Problem 4: Advantage function + +```python +a = r - b +``` +where a is the advantage function, r is the accumulated reward, and b is the predicted baseline. + +## Problem 5 + +Here I compare the results with and without variance reduction: + +|With baseline|Without baseline| +|---|---| +||| +

X-axis: iteration, Y-axis: return reward

+ +This figure shows the variance of the cases with and without a baseline. I ran each case 10 times and recorded the average return at each iteration (training stops once the average return reaches 195). The solid line denotes the average return at each iteration. The upper line marks the maximum return in that iteration, while the lower line marks the minimum return in that iteration. +**P.S.** The difference is not very apparent; I should probably run each of the 10 trials for a fixed 100 iterations and plot the figure again. +**P.S.** Why do we need variance reduction? Here we are using REINFORCE, which is known to have high variance (it depends heavily on the initial samples). +**P.S.** Actually, the results depend heavily on the initial parameters. If the initial average return is around 30, the agent can reach 195 in around 70 iterations; if the initial average return is around 15, it takes about 100 iterations to reach 195. + +## Problem 6 + +The reason we need to standardize the advantage function is that, when we calculate the accumulated reward, the immediate reward we get is exponentially discounted by the discount factor. As a result, actions in later stages cannot be learned efficiently. If we standardize the advantage function over time steps, we are always encouraging and discouraging roughly half of the performed actions. Mathematically, you can also interpret this trick as a way of controlling the variance of the policy gradient estimator.   +**Additional survey on generalized advantage estimation (GAE):**   +- ***High-Dimensional Continuous Control Using Generalized Advantage Estimation*** [[ICLR 2016]](https://arxiv.org/abs/1506.02438) + - John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, Pieter Abbeel + - In extremely high-dimensional tasks (like continuous control in a 3D environment), stability is a key point. 
+ - Proposes an effective variance reduction scheme for policy gradients, called generalized advantage estimation (GAE) + - Motivation of GAE: Suppose we have a fixed number of steps k; from eq.15, we know that the bias of each advantage estimator is **k-dependent**. As k increases, the bias term becomes more negligible while the variance increases, and vice versa. (If you find this concept abstract, recall that MC is unbiased but has high variance, while TD is biased but has low variance.) + - ***λ*** is a new concept introduced in this paper. + - If λ = 0 (like eq.17), then we have low variance, but the estimator is biased + - If λ = 1 (like eq.18), then we have high variance, but the estimator is unbiased + +## Reference + +- [Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) +- [Deep-Reinforcement-Learning-Survey](https://github.com/andrewliao11/Deep-Reinforcement-Learning-Survey) diff --git a/with_variance_reduce_max.png b/with_variance_reduce_max.png new file mode 100644 index 0000000..62ddbd2 Binary files /dev/null and b/with_variance_reduce_max.png differ diff --git a/without_variance_reduce_max.png b/without_variance_reduce_max.png new file mode 100644 index 0000000..6b54c81 Binary files /dev/null and b/without_variance_reduce_max.png differ