"""This is a fork from https://github.com/kvfrans/openai-cartpole/blob/master/cartpole-policygradient.py
The reward function is differernt, and it is:
/\r
|
|\
| \
|  \
|   \
0------>t
/\
|
This is the time "done" reached
"""
#!/usr/bin/env python3
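# A minimal sketch of the shaped reward described in the docstring above,
# assuming a peak of 1.0 at t = 0 and a linear decay to 0 at the step where
# "done" is reached. The name shaped_reward and the peak value are illustrative
# assumptions, not taken from the original code, and this helper is not used by
# the training loop below.
def shaped_reward(t, done_t):
    """Linearly decaying reward: 1.0 at t = 0, falling to 0.0 at t = done_t."""
    if done_t <= 0:
        return 0.0
    return max(0.0, 1.0 - float(t) / float(done_t))
# e.g. shaped_reward(0, 100) == 1.0, shaped_reward(50, 100) == 0.5, shaped_reward(100, 100) == 0.0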
import tensorflow as tf
import numpy as np
import random
import gym

def softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    return out

with tf.variable_scope("policy"):
params = tf.get_variable("policy_parameters", [4, 2])
state = tf.placeholder("float", [None, 4])
actions = tf.placeholder("float", [None, 2])
linear = tf.matmul(state, params)
probabilities = tf.nn.softmax(linear)
good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions), reduction_indices=)
loss = -tf.reduce_sum(eligibility)
return probabilities, state, actions, advantages, optimizer

with tf.variable_scope("value"):
state = tf.placeholder("float", [None, 4])
newvals = tf.placeholder("float", [None, 1])
w1 = tf.get_variable("w1", [4, 10])
b1 = tf.get_variable("b1", )
h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
w2 = tf.get_variable("w2", [10, 1])
b2 = tf.get_variable("b2", )
calculated = tf.matmul(h1, w2) + b2
diffs = calculated - newvals
loss = tf.nn.l2_loss(diffs)
return calculated, state, newvals, optimizer, loss

def run_episode(env, policy_grad, value_grad, sess, render=False):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []

    for _ in range(20000):
        env.render(close=not render)
        # calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(pl_calculated, feed_dict={pl_state: obs_vector})
        action = 0 if random.uniform(0, 1) < probs[0][0] else 1
        # record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)
        # take the action in the environment
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward))
        totalreward += reward

        if done:
            break

    for index, trans in enumerate(transitions):
        # The return function here is not the same as the one I designed.
        obs, action, reward = trans

        # calculate discounted monte-carlo return
        future_reward = 0
        future_transitions = len(transitions) - index
        decrease = 1
        for index2 in range(future_transitions):
            future_reward += transitions[index2 + index][2] * decrease
            decrease = decrease * 0.97
        obs_vector = np.expand_dims(obs, axis=0)
        currentval = sess.run(vl_calculated, feed_dict={vl_state: obs_vector})[0][0]

        # advantage: how much better the return was than the value estimate
        advantages.append(future_reward - currentval)

        # update the value function towards the new return
        update_vals.append(future_reward)

    # update value function
    update_vals_vector = np.expand_dims(update_vals, axis=1)
    sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})
    # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

    # update policy using the computed advantages
    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

    return totalreward
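# Illustration only (not called by the training loop below): the discounted
# Monte-Carlo return computed inside run_episode, written as a standalone
# helper. The name discounted_returns is an assumption for illustration; the
# default gamma=0.97 mirrors the decay factor used above.
def discounted_returns(rewards, gamma=0.97):
    """Return G_t = sum_k gamma**k * r_{t+k} for every step t."""
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns
# e.g. discounted_returns([1.0, 1.0, 1.0]) == [1.0 + 0.97 + 0.97**2, 1.0 + 0.97, 1.0]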

env = gym.make('CartPole-v0')
# env.monitor.start('cartpole-hill/', force=True)
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
for i in range(2000):
    reward = run_episode(env, policy_grad, value_grad, sess)
    if reward == 200:
        print("reward 200")
        print(i)
        break
t = 0
for _ in range(1000):
    # the body of this final rollout loop was truncated in the source;
    # rendering the environment here is a minimal placeholder
    env.render()