
Commit 655dd0e

araffin authored and hill-a committed Feb 11, 2019

Release 2.4.1 (hill-a#194)

* Prepare 2.4.1 release
* Fix variable not updated in traj_segment_generator
1 parent 5acf88f · commit 655dd0e

File tree

6 files changed (+25, -11 lines)

LICENSE (+1)

@@ -1,6 +1,7 @@
 The MIT License
 
 Copyright (c) 2017 OpenAI (http://openai.com)
+Copyright (c) 2018-2019 Stable-Baselines Team
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

docs/conf.py (+1, -1)

@@ -42,7 +42,7 @@ def __getattr__(cls, name):
 # -- Project information -----------------------------------------------------
 
 project = 'Stable Baselines'
-copyright = '2018, Stable Baselines'
+copyright = '2018-2019, Stable Baselines'
 author = 'Stable Baselines Contributors'
 
 # The short X.Y version

docs/misc/changelog.rst (+5, -2)

@@ -5,9 +5,11 @@ Changelog
 
 For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.
 
-Pre-Release 2.4.1a (WIP)
+Release 2.4.1 (2019-02-11)
 --------------------------
 
+**Bug fixes and improvements**
+
 - fixed computation of training metrics in TRPO and PPO1
 - added ``reset_num_timesteps`` keyword when calling train() to continue tensorboard learning curves
 - reduced the size taken by tensorboard logs (added a ``full_tensorboard_log`` to enable full logging, which was the previous behavior)
@@ -17,6 +19,7 @@ Pre-Release 2.4.1a (WIP)
 - fixed custom policy examples in the doc for DQN and DDPG
 - remove gym spaces patch for equality functions
 - fixed tensorflow dependency: cpu version was installed overwritting tensorflow-gpu when present.
+- fixed a bug in ``traj_segment_generator`` (used in ppo1 and trpo) where ``new`` was not updated. (spotted by @junhyeokahn)
 
 
 Release 2.4.0 (2019-01-17)
@@ -233,4 +236,4 @@ Contributors (since v2.0.0):
 In random order...
 
 Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
-@EliasHasle @mrakgr @Bleyddyn @antoine-galataud
+@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn
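
The last changelog bullet above is the substantive fix in this release: because the flag new kept its
initial value and was never refreshed inside the rollout loop, the yielded bootstrap value
nextvpred = vpred * (1 - new) was effectively constant, which skews the value targets whenever a segment
is cut in the middle of an episode. The sketch below shows where that bootstrap enters a GAE-style
advantage computation. It is a simplified illustration only; the seg keys mirror what the generator
yields, but the function is loosely modelled on the usual add_vtarg_and_adv helper rather than being the
exact library code.

import numpy as np

def compute_gae(seg, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation over one yielded segment (illustrative sketch).

    seg["nextvpred"] bootstraps the value of the state that follows the last step of
    the segment; if the done flag behind it is never refreshed, that bootstrap is
    effectively constant and the advantages at the tail of the segment are biased.
    """
    dones = np.append(seg["dones"], 0)                  # done flags, padded for indexing
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # value estimates plus bootstrap
    horizon = len(seg["rew"])
    adv = np.empty(horizon, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(horizon)):
        nonterminal = 1.0 - dones[t + 1]                # mask the bootstrap across episode ends
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = last_gae = delta + gamma * lam * nonterminal * last_gae
    return adv, adv + seg["vpred"]                      # advantages and TD(lambda) value targets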

setup.py (+1, -1)

@@ -136,7 +136,7 @@
     license="MIT",
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version="2.4.1a0",
+    version="2.4.1",
 )
 
 # python setup.py sdist

stable_baselines/__init__.py (+1, -1)

@@ -9,4 +9,4 @@
 from stable_baselines.trpo_mpi import TRPO
 from stable_baselines.sac import SAC
 
-__version__ = "2.4.1a0"
+__version__ = "2.4.1"

stable_baselines/trpo_mpi/utils.py (+16, -6)

@@ -18,7 +18,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
     - ob: (np.ndarray) observations
     - rew: (numpy float) rewards (if gail is used it is the predicted reward)
     - vpred: (numpy float) action logits
-    - new: (numpy bool) dones (is end of episode)
+    - dones: (numpy bool) dones (is end of episode -> True if first timestep of an episode)
     - ac: (np.ndarray) actions
     - prevac: (np.ndarray) previous actions
     - nextvpred: (numpy float) next action logits
@@ -32,7 +32,6 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
     # Initialize state variables
     step = 0
     action = env.action_space.sample()  # not used, just so we have the datatype
-    new = True
     observation = env.reset()
 
     cur_ep_ret = 0  # return in current episode
@@ -51,7 +50,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
     actions = np.array([action for _ in range(horizon)])
     prev_actions = actions.copy()
     states = policy.initial_state
-    done = None
+    done = True  # marks if we're on first timestep of an episode
 
     while True:
         prevac = action
@@ -66,9 +65,20 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
             else:
                 current_it_timesteps = sum(ep_lens) + current_it_len
 
-            yield {"ob": observations, "rew": rews, "dones": dones, "true_rew": true_rews, "vpred": vpreds,
-                   "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets,
-                   "ep_lens": ep_lens, "ep_true_rets": ep_true_rets, "total_timestep": current_it_timesteps}
+            yield {
+                "ob": observations,
+                "rew": rews,
+                "dones": dones,
+                "true_rew": true_rews,
+                "vpred": vpreds,
+                "ac": actions,
+                "prevac": prev_actions,
+                "nextvpred": vpred[0] * (1 - done),
+                "ep_rets": ep_rets,
+                "ep_lens": ep_lens,
+                "ep_true_rets": ep_true_rets,
+                "total_timestep": current_it_timesteps
+            }
             _, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape))
             # Be careful!!! if you change the downstream algorithm to aggregate
             # several of these batches, then be sure to do a deepcopy
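
For orientation, the hunks above amount to the following pattern: the done flag must be refreshed from
env.step on every iteration so that the yielded nextvpred is masked only when an episode really just
ended, instead of relying on a flag that is set once and never updated. The sketch below condenses that
pattern; the policy.step interface and the buffer handling are simplified assumptions for illustration,
not the actual traj_segment_generator implementation.

import numpy as np

def segment_generator(policy, env, horizon):
    """Yield fixed-length rollout segments from a gym-style env (illustrative sketch).

    done starts at True (the first timestep begins an episode) and is overwritten by
    env.step on each iteration, so the bootstrap value nextvpred is zeroed only when
    the segment happens to end on an episode boundary.
    """
    step = 0
    done = True                                   # first timestep of an episode
    observation = env.reset()
    observations = np.array([observation for _ in range(horizon)])
    rewards = np.zeros(horizon, dtype=np.float32)
    vpreds = np.zeros(horizon, dtype=np.float32)
    dones = np.zeros(horizon, dtype=bool)

    while True:
        action, vpred = policy.step(observation[None])   # assumed batched policy call
        if step > 0 and step % horizon == 0:
            yield {
                "ob": observations.copy(), "rew": rewards.copy(),
                "vpred": vpreds.copy(), "dones": dones.copy(),
                # bootstrap with the latest value estimate unless an episode just ended
                "nextvpred": vpred[0] * (1 - done),
            }
        idx = step % horizon
        observations[idx], dones[idx], vpreds[idx] = observation, done, vpred[0]
        observation, rewards[idx], done, _ = env.step(action[0])
        if done:
            observation = env.reset()
        step += 1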
