diff --git a/HW2_Policy_Graident.ipynb b/HW2_Policy_Graident.ipynb
index 3b51aa3..adcce41 100644
--- a/HW2_Policy_Graident.ipynb
+++ b/HW2_Policy_Graident.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
@@ -26,11 +26,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gym.envs.registration:Making new env: CartPole-v0\n",
+      "[2016-10-12 13:50:57,388] Making new env: CartPole-v0\n"
+     ]
+    }
+   ],
    "source": [
     "import gym\n",
     "import tensorflow as tf\n",
@@ -38,11 +47,12 @@
     "from policy_gradient import util\n",
     "from policy_gradient.policy import CategoricalPolicy\n",
     "from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline\n",
+    "from IPython.display import clear_output\n",
     "\n",
     "np.random.seed(0)\n",
     "tf.set_random_seed(0)\n",
     "\n",
-    "# CartPole-v0 is a MDP with finite state and action space. \n",
+    "# CartPole-v0 is an MDP with finite state and action space. \n",
     "# In this environment, A pendulum is attached by an un-actuated joint to a cart, \n",
     "# and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart.\n",
     "# A reward of +1 is provided for every timestep that the pendulum remains upright. \n",
@@ -95,23 +105,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py:90: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
+      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
+     ]
+    }
+   ],
    "source": [
     "sess = tf.Session()\n",
     "\n",
     "# Construct a neural network to represent policy which maps observed state to action. \n",
-    "in_dim = util.flatten_space(env.observation_space)\n",
-    "out_dim = util.flatten_space(env.action_space)\n",
+    "in_dim = util.flatten_space(env.observation_space)  #4\n",
+    "out_dim = util.flatten_space(env.action_space)      #2\n",
     "hidden_dim = 8\n",
     "\n",
     "opt = tf.train.AdamOptimizer(learning_rate=0.01)\n",
-    "policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)\n",
-    "\n",
-    "sess.run(tf.initialize_all_variables())"
+    "policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)"
    ]
   },
   {
@@ -151,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "collapsed": true
    },
@@ -176,7 +193,7 @@
     "        ob = self.env.reset()\n",
     "\n",
     "        for _ in range(self.path_length):\n",
-    "            a = self.policy.act(ob.reshape(1, -1))\n",
+    "            a = self.policy.act(ob.reshape(1, -1)) # (1,4)\n",
     "            next_ob, r, done, _ = self.env.step(a)\n",
     "            obs.append(ob)\n",
     "            actions.append(a)\n",
@@ -200,7 +217,6 @@
     "            \n",
     "            # `p[\"rewards\"]` is a matrix contains the rewards of each timestep in a sample path\n",
     "            r = util.discount_cumsum(p[\"rewards\"], self.discount_rate)\n",
-    "            \n",
     "            \"\"\"\n",
     "            Problem 4:\n",
     "\n",
@@ -210,7 +226,7 @@
     "            Sample solution should be only 1 line.\n",
     "            \"\"\"\n",
     "            # YOUR CODE HERE >>>>>>\n",
-    "            # a = ???\n",
+    "            a = r - b\n",
     "            # <<<<<<<<\n",
     "\n",
     "            p[\"returns\"] = r\n",
@@ -230,6 +246,7 @@
     "        )\n",
     "\n",
     "    def train(self):\n",
+    "        avg_return_list = []\n",
     "        for i in range(1, self.n_iter + 1):\n",
     "            paths = []\n",
     "            for _ in range(self.n_episode):\n",
@@ -238,14 +255,15 @@
     "            loss = self.policy.train(data[\"observations\"], data[\"actions\"], data[\"advantages\"])\n",
     "            avg_return = np.mean([sum(p[\"rewards\"]) for p in paths])\n",
     "            print(\"Iteration {}: Average Return = {}\".format(i, avg_return))\n",
-    "            \n",
+    "            avg_return_list.append(avg_return)\n",
     "            # CartPole-v0 defines \"solving\" as getting average reward of 195.0 over 100 consecutive trials.\n",
     "            if avg_return >= 195:\n",
     "                print(\"Solve at {} iterations, which equals {} episodes.\".format(i, i*100))\n",
     "                break\n",
     "\n",
     "            if self.baseline != None:\n",
-    "                self.baseline.fit(paths)"
+    "                self.baseline.fit(paths)\n",
+    "        return avg_return_list, i"
    ]
   },
   {
@@ -254,8 +272,50 @@
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration 1: Average Return = 18.04\n",
+      "Iteration 2: Average Return = 18.83\n",
+      "Iteration 3: Average Return = 21.8\n",
+      "Iteration 4: Average Return = 20.61\n",
+      "Iteration 5: Average Return = 23.93\n",
+      "Iteration 6: Average Return = 23.09\n",
+      "Iteration 7: Average Return = 27.2\n",
+      "Iteration 8: Average Return = 29.16\n",
+      "Iteration 9: Average Return = 31.71\n",
+      "Iteration 10: Average Return = 32.73\n",
+      "Iteration 11: Average Return = 34.81\n",
+      "Iteration 12: Average Return = 37.38\n",
+      "Iteration 13: Average Return = 39.43\n",
+      "Iteration 14: Average Return = 40.16\n",
+      "Iteration 15: Average Return = 41.08\n",
+      "Iteration 16: Average Return = 49.69\n",
+      "Iteration 17: Average Return = 50.16\n",
+      "Iteration 18: Average Return = 45.54\n",
+      "Iteration 19: Average Return = 49.39\n",
+      "Iteration 20: Average Return = 54.45\n",
+      "Iteration 21: Average Return = 48.18\n",
+      "Iteration 22: Average Return = 52.51\n",
+      "Iteration 23: Average Return = 49.41\n",
+      "Iteration 24: Average Return = 56.92\n",
+      "Iteration 25: Average Return = 55.99\n",
+      "Iteration 26: Average Return = 51.51\n",
+      "Iteration 27: Average Return = 62.38\n",
+      "Iteration 28: Average Return = 55.51\n",
+      "Iteration 29: Average Return = 58.85\n",
+      "Iteration 30: Average Return = 62.69\n",
+      "Iteration 31: Average Return = 57.4\n",
+      "Iteration 32: Average Return = 58.77\n",
+      "Iteration 33: Average Return = 65.02\n",
+      "Iteration 34: Average Return = 65.5"
+     ]
+    }
+   ],
    "source": [
+    "#sess.run(tf.initialize_all_variables())\n",
     "n_iter = 200\n",
     "n_episode = 100\n",
     "path_length = 200\n",
@@ -266,7 +326,160 @@
     "                     discount_rate)\n",
     "\n",
     "# Train the policy optimizer\n",
-    "po.train()"
+    "avg_return_list = []\n",
+    "iter_list = []\n",
+    "for i in range(10):\n",
+    "    sess.run(tf.initialize_all_variables())\n",
+    "    avg_return_list_tem, iter = po.train()\n",
+    "    avg_return_list.append(avg_return_list_tem)\n",
+    "    iter_list.append(iter)\n",
+    "    print i, 'trial =', iter\n",
+    "np.savez('w_baseline', avg_return_list=avg_return_list, iter_list=iter_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "zero-size array to reduction operation maximum which has no identity",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-9-1d9cb952c4a1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mll\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miter_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[0mll\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mavg_return_list\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/fromnumeric.pyc\u001b[0m in \u001b[0;36mamax\u001b[1;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[0;32m   2291\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2292\u001b[0m             return _methods._amax(a, axis=axis,\n\u001b[1;32m-> 2293\u001b[1;33m                                 out=out, **kwargs)\n\u001b[0m\u001b[0;32m   2294\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mamax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2295\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.pyc\u001b[0m in \u001b[0;36m_amax\u001b[1;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[0;32m     24\u001b[0m \u001b[1;31m# small reductions\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     25\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_amax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mumr_maximum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     27\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_amin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mValueError\u001b[0m: zero-size array to reduction operation maximum which has no identity"
+     ]
+    }
+   ],
+   "source": [
+    "import seaborn as sns \n",
+    "import matplotlib.pyplot as plt\n",
+    "ll = np.zeros([10, np.max(iter_list)])\n",
+    "for i in range(10):\n",
+    "    ll[i,:len(avg_return_list[i])] = avg_return_list[i]\n",
+    "s = np.sum(ll, axis=0)\n",
+    "nonzero = np.sum((ll != 0)+0, axis=0)\n",
+    "s_ = s/nonzero\n",
+    "max = np.zeros([np.max(iter_list)])\n",
+    "min = np.zeros([np.max(iter_list)])\n",
+    "for i in range(np.max(iter_list)):\n",
+    "    max[i] = 0\n",
+    "    min[i] = 1000\n",
+    "    for j in range(10):\n",
+    "        try:\n",
+    "            if max[i] < avg_return_list[j][i]:\n",
+    "                max[i] = avg_return_list[j][i]\n",
+    "            if min[i] > avg_return_list[j][i]:\n",
+    "                min[i] = avg_return_list[j][i]\n",
+    "        except IndexError:\n",
+    "            a=1\n",
+    "max = max - s_\n",
+    "min = s_ - min\n",
+    "xs=np.linspace(1,np.max(iter_list),np.max(iter_list))\n",
+    "plt.errorbar(xs, s_, yerr = [min, max])\n",
+    "plt.xlabel('Iteration')\n",
+    "plt.ylabel('Avg_return')\n",
+    "#plt.show()\n",
+    "plt.savefig('without_variance_reduce_max.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(103,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print min.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "setting an array element with a sequence.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-80-d92260406ffa>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mxs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mavg_return_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxlabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Iteration'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Avg_return'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/pyplot.pyc\u001b[0m in \u001b[0;36mplot\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m   3152\u001b[0m         \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3153\u001b[0m     \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3154\u001b[1;33m         \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3155\u001b[0m     \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3156\u001b[0m         \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhold\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwashold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.pyc\u001b[0m in \u001b[0;36minner\u001b[1;34m(ax, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1809\u001b[0m                     warnings.warn(msg % (label_namer, func.__name__),\n\u001b[0;32m   1810\u001b[0m                                   RuntimeWarning, stacklevel=2)\n\u001b[1;32m-> 1811\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0max\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1812\u001b[0m         \u001b[0mpre_doc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minner\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__doc__\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1813\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mpre_doc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_axes.pyc\u001b[0m in \u001b[0;36mplot\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1426\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1427\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_lines\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1428\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1429\u001b[0m             \u001b[0mlines\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1430\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_base.pyc\u001b[0m in \u001b[0;36madd_line\u001b[1;34m(self, line)\u001b[0m\n\u001b[0;32m   1697\u001b[0m             \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_clip_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpatch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1698\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1699\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_update_line_limits\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1700\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1701\u001b[0m             \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'_line%d'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlines\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/axes/_base.pyc\u001b[0m in \u001b[0;36m_update_line_limits\u001b[1;34m(self, line)\u001b[0m\n\u001b[0;32m   1708\u001b[0m         \u001b[0mFigures\u001b[0m \u001b[0mout\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdata\u001b[0m \u001b[0mlimit\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mupdating\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataLim\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1709\u001b[0m         \"\"\"\n\u001b[1;32m-> 1710\u001b[1;33m         \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1711\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvertices\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1712\u001b[0m             \u001b[1;32mreturn\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/lines.pyc\u001b[0m in \u001b[0;36mget_path\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    924\u001b[0m         \"\"\"\n\u001b[0;32m    925\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_invalidy\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_invalidx\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 926\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecache\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    927\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    928\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/matplotlib/lines.pyc\u001b[0m in \u001b[0;36mrecache\u001b[1;34m(self, always)\u001b[0m\n\u001b[0;32m    618\u001b[0m                 \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0myconv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfilled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    619\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 620\u001b[1;33m                 \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0myconv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    621\u001b[0m             \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    622\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/numpy/core/numeric.pyc\u001b[0m in \u001b[0;36masarray\u001b[1;34m(a, dtype, order)\u001b[0m\n\u001b[0;32m    480\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    481\u001b[0m     \"\"\"\n\u001b[1;32m--> 482\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    483\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    484\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0masanyarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."
+     ]
+    }
+   ],
+   "source": [
+    "import seaborn as sns \n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "xs=np.linspace(1,len(avg_return_list),len(avg_return_list))\n",
+    "plt.plot(xs, avg_return_list)\n",
+    "plt.xlabel('Iteration')\n",
+    "plt.ylabel('Avg_return')\n",
+    "plt.show()\n",
+    "plt.savefig('with_variance_reduce.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "xs=np.linspace(1,len(avg_return_list_n),len(avg_return_list_n))\n",
+    "plt.plot(xs, avg_return_list_n)\n",
+    "plt.xlabel('Iteration')\n",
+    "plt.ylabel('Avg_return')\n",
+    "plt.show()\n",
+    "plt.savefig('without_variance_reduce.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "xs=np.linspace(1,len(avg_return_list),len(avg_return_list))\n",
+    "xs_n=np.linspace(1,len(avg_return_list_n),len(avg_return_list_n))\n",
+    "plt.plot(xs, avg_return_list, xs_n, avg_return_list_n)\n",
+    "plt.xlabel('Iteration')\n",
+    "plt.ylabel('Avg_return')\n",
+    "plt.legend(['With variance reduction', 'Without variance reduction'], loc='upper left')\n",
+    "plt.savefig('compare.png')\n",
+    "plt.show()\n"
    ]
   },
   {
@@ -327,7 +540,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.11"
+   "version": "2.7.6"
   }
  },
  "nbformat": 4,
diff --git a/compare.png b/compare.png
new file mode 100644
index 0000000..063aeb8
Binary files /dev/null and b/compare.png differ
diff --git a/policy_gradient/policy.py b/policy_gradient/policy.py
index 70b8cdc..bbc45fe 100644
--- a/policy_gradient/policy.py
+++ b/policy_gradient/policy.py
@@ -1,5 +1,6 @@
 import tensorflow as tf
 import numpy as np
+import pdb
 
 class CategoricalPolicy(object):
     def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
@@ -27,15 +28,15 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
         Sample solution is about 2~4 lines.
         """
         # YOUR CODE HERE >>>>>>
-        # probs = ???
+		# Andrew
+        h1 = tf.contrib.layers.fully_connected(self._observations, num_outputs=hidden_dim, activation_fn=tf.tanh)
+        h2 = tf.contrib.layers.fully_connected(h1, num_outputs=out_dim, activation_fn=None)
+        probs = tf.nn.softmax(h2)	# (None,out_dim)
         # <<<<<<<<
-
         # --------------------------------------------------
         # This operation (variable) is used when choosing action during data sampling phase
         # Shape of probs: [1, n_actions]
-
         act_op = probs[0, :]
-
         # --------------------------------------------------
         # Following operations (variables) are used when updating model
         # Shape of probs: [n_timestep_per_iter, n_actions]
@@ -57,7 +58,6 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
 
         # Add 1e-8 to `probs_vec` so as to prevent log(0) error
         log_prob = tf.log(probs_vec + 1e-8)
-
         """
         Problem 2:
 
@@ -69,12 +69,13 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
         Sample solution is about 1~3 lines.
         """
         # YOUR CODE HERE >>>>>>
-        # surr_loss = ???
+        surr_loss = tf.reduce_mean(tf.mul(log_prob, self._advantages))
         # <<<<<<<<
 
-        grads_and_vars = self._opt.compute_gradients(surr_loss)
-        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")
-
+        #grads_and_vars = self._opt.compute_gradients(surr_loss)
+        #train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")
+		
+        train_op  =self._opt.minimize(-surr_loss)
         # --------------------------------------------------
         # This operation (variable) is used when choosing action during data sampling phase
         self._act_op = act_op
diff --git a/policy_gradient/util.py b/policy_gradient/util.py
index 4c57674..1d43f69 100644
--- a/policy_gradient/util.py
+++ b/policy_gradient/util.py
@@ -1,6 +1,8 @@
 from gym.spaces import Box, Discrete
 import numpy as np
 from scipy.signal import lfilter
+import math
+import pdb
 
 def flatten_space(space):
 	if isinstance(space, Box):
@@ -19,7 +21,12 @@ def flatten_space(space):
 Sample solution is about 1~7 lines.
 """
 
-# def discount_cumsum(x, discount_rate):
-    # YOUR CODE HERE >>>>>>
-    # return ???
-    # <<<<<<<<
\ No newline at end of file
+def discount_cumsum(x, discount_rate):
+	"""Return y where y[t] = sum over k >= t of discount_rate**(k - t) * x[k]."""
+	# YOUR CODE HERE >>>>>>
+	returns, running = np.zeros(len(x)), 0.0
+	for t in reversed(range(len(x))):
+		running = x[t] + discount_rate * running  # y[t] = x[t] + discount_rate * y[t+1]
+		returns[t] = running
+	return returns
+	# <<<<<<<<
diff --git a/report.md b/report.md
new file mode 100644
index 0000000..c5f64a7
--- /dev/null
+++ b/report.md
@@ -0,0 +1,72 @@
+# Policy gradient report
+Member: 廖元宏(102061137), 莊景堯(102061145)    
+Implement a simple agent with the REINFORCE algorithm, which uses Monte Carlo (MC) sampling and the policy gradient.   
+
+## Problem 1~4
+
+- Problem 1: construct a simple two-layer fully connected network for policy prediction 
+Here we use a 2-layer neural network to represent the policy. Make sure you add a softmax layer so that the output represents a probability distribution.
+```python  
+h1 = tf.contrib.layers.fully_connected(self._observations, num_outputs=hidden_dim, activation_fn=tf.tanh)   
+h2 = tf.contrib.layers.fully_connected(h1, num_outputs=out_dim, activation_fn=None)
+probs = tf.nn.softmax(h2)
+```
+Use a simple two-layer perceptron to map the state to the action space.
+
+- Problem 2: surrogate loss
+Since TensorFlow optimizers only support minimizing a loss (gradient descent), we simply add a minus sign to perform **gradient ascent**.
+```python
+surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
+```
+
+- Problem 3: accumulated reward
+Construct a simple for-loop to calculate the accumulated discounted reward from the end of the game to the start.
+
+```python
+def discount_cumsum(x, discount_rate):
+	returns = np.zeros(len(x))
+	running = 0.0  # discounted return of the steps after the current one
+	for t in reversed(range(len(x))):
+		running = x[t] + discount_rate * running  # y[t] = x[t] + discount_rate * y[t+1]
+		returns[t] = running
+	return returns
+```   
+
+- Problem 4: Advantage function
+
+```python
+a = r - b
+```
+where a is the advantage function, r is the accumulated reward, and b is the predicted baseline.
+
+## Problem 5
+
+Here I compare the results with and without variance reduction:  
+
+|With baseline|Without baseline|
+|---|---|
+|<img src="https://github.com/andrewliao11/homework2/blob/master/with_variance_reduce_max.png?raw=true" width="700">|<img src="https://github.com/andrewliao11/homework2/blob/master/without_variance_reduce_max.png?raw=true" width="700">|
+<p align="center">X-axis: iteration, Y-axis: return reward</p>
+
+This figure shows the variance of the cases with and without a baseline. I ran each case 10 times and recorded the average return at each iteration (a run terminates once the average return reaches 195). The solid line denotes the average return at each iteration. The upper bar marks the maximum return in that iteration, while the lower bar marks the minimum return in that iteration.   
+**P.S.** The difference is not very apparent; I guess that I should run each of the 10 games for a fixed 100 iterations and then plot the figure.   
+**P.S.** Why do we need variance reduction? Here we are using REINFORCE, which is known to have high variance (it depends heavily on the initial samples).   
+**P.S.** Actually, the results depend heavily on the initial parameters. If the initial return is around 30, the agent can reach 195 in around 70 iterations; if the initial return is around 15, it takes about 100 iterations to reach 195.
+
+## Problem 6
+
+The reason we need to standardize the advantage function is that, when we calculate the accumulated reward, the rewards we receive are exponentially discounted by the discount factor, so actions in later stages cannot be learned efficiently. If we standardize the advantage function over time steps, we are always encouraging and discouraging roughly half of the performed actions. Mathematically, you can also interpret this trick as a way of controlling the variance of the policy gradient estimator.    
+**Additional survey on generalized advantage estimation (GAE):**   
+- ***High-Dimensional Continuous Control Using Generalized Advantage Estimation*** [[ICLR 2016]](https://arxiv.org/abs/1506.02438)
+	- John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, Pieter Abbeel
+	- In extremely high-dimensional tasks (like continuous control in a 3D environment), stability is a key point.
+	- Proposes an effective variance reduction scheme for policy gradients, called generalized advantage estimation (GAE).
+	-  Motivation of GAE: Suppose we have a fixed number of steps; from eq.15, we know that the bias of each advantage estimator is **k-dependent**. As k increases, the bias term becomes more negligible while the variance increases, and vice versa. (If you find this concept abstract, recall that MC is unbiased but has high variance, while TD is biased but has low variance.)
+	-  ***λ*** is a new concept included in this paper. 
+		-  If λ = 0 (like eq.17), the estimator has low variance but is biased
+		-  If λ = 1 (like eq.18), the estimator has high variance but is unbiased
+
+## Reference
+
+- [Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/)
+- [Deep-Reinforcement-Learning-Survey](https://github.com/andrewliao11/Deep-Reinforcement-Learning-Survey)
diff --git a/with_variance_reduce_max.png b/with_variance_reduce_max.png
new file mode 100644
index 0000000..62ddbd2
Binary files /dev/null and b/with_variance_reduce_max.png differ
diff --git a/without_variance_reduce_max.png b/without_variance_reduce_max.png
new file mode 100644
index 0000000..6b54c81
Binary files /dev/null and b/without_variance_reduce_max.png differ