forked from ZikangXiong/VRL_CodeReview
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoscillator10.py
More file actions
119 lines (99 loc) · 3.53 KB
/
oscillator10.py
File metadata and controls
119 lines (99 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from main import *
from shield import Shield
from Environment import PolySysEnvironment
from DDPG import *
# Show that there is an invariant that can prove the policy safe
def oscillator(learning_method, number_of_rollouts, simulation_steps, learning_eposides, critic_structure, actor_structure, train_dir):
    """Train a DDPG controller for a 10-state oscillator chain, then learn
    and test a linear shield that keeps the closed-loop system safe.

    Args:
        learning_method: shield-synthesis method name (e.g. "random_search").
        number_of_rollouts: rollouts per shield-learning iteration.
        simulation_steps: steps simulated per rollout.
        learning_eposides: DDPG training episodes (0 = load/test only).
        critic_structure, actor_structure: hidden-layer sizes for the nets.
        train_dir: directory for the model checkpoint and K.model files.
    """
    # 10-dimensional state, 2 control inputs.
    # (The original comment said "1-input" but us = 2 and both u[0], u[1]
    # are used in the dynamics below.)
    ds = 10
    us = 2

    def f(x, u):
        """Continuous-time dynamics: return dx/dt as a (ds, 1) column vector.

        States 0 and 1 are directly actuated; states 2..9 form a chain of
        first-order lags driven (from state 2 onward) by the previous link.
        """
        delta = np.zeros((ds, 1), float)
        delta[0, 0] = -2 * x[0, 0] + u[0, 0]
        delta[1, 0] = -x[1, 0] + u[1, 0]
        # Head of the chain is fed by state 0 (NOT state 1).
        delta[2, 0] = 5 * x[0, 0] - 5 * x[2, 0]
        # Remaining links: x[i]' = 5*(x[i-1] - x[i]).
        for i in range(3, ds):
            delta[i, 0] = 5 * x[i - 1, 0] - 5 * x[i, 0]
        return delta

    def f_to_str(K):
        """Closed-loop dynamics as strings (1-indexed x[...], for the verifier)."""
        kstr = K_to_str(K)
        f = []
        f.append("-2*x[1] + {}".format(kstr[0]))
        f.append("-x[2] + {}".format(kstr[1]))
        f.append("5*x[1]-5*x[3]")
        for i in range(4, ds + 1):
            f.append("5*x[{}]-5*x[{}]".format(i - 1, i))
        return f

    h = 0.01        # integration timestep
    eq_err = 1e-2   # amount of Gaussian noise in the dynamics

    # Initial state space: only the first two states start away from zero.
    s_min = np.array([[0.2], [-0.1], [0], [0], [0], [0], [0], [0], [0], [0]])
    s_max = np.array([[0.3], [ 0.1], [0], [0], [0], [0], [0], [0], [0], [0]])

    # Identity state/input cost matrices (LQR-style quadratic reward).
    Q = np.eye(ds)
    R = np.eye(us)

    # User-defined unsafety condition: the tail of the chain must stay
    # below 0.05.
    def unsafe_eval(x):
        return x[9, 0] >= 0.05

    def unsafe_string():
        # Same predicate, 1-indexed, as required by the verifier.
        return ["x[10] - 0.05"]

    def rewardf(x, Q, u, R):
        """Quadratic LQR cost plus a large penalty for entering the unsafe set."""
        reward = 0
        reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
        if unsafe_eval(x):
            reward -= 100
        return reward

    def testf(x, u):
        """Return -1 (failure) if the state is unsafe, 0 otherwise."""
        if unsafe_eval(x):
            # NOTE: original used Python-2 `print x`; print(x) with a single
            # argument behaves identically under both Python 2 and 3.
            print(x)
            return -1
        return 0

    u_min = np.array([[-50.], [-50]])
    u_max = np.array([[ 50.], [ 50]])

    env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds, us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min, timestep=h)

    ############ Train and Test NN model ############
    args = {'actor_lr': 0.0001,
            'critic_lr': 0.001,
            'actor_structure': actor_structure,
            'critic_structure': critic_structure,
            'buffer_size': 1000000,
            'gamma': 0.99,
            'max_episode_len': 64,
            'max_episodes': learning_eposides,
            'minibatch_size': 20,
            'random_seed': 6553,
            'tau': 0.005,
            'model_path': train_dir + "model.chkp",
            'enable_test': True,
            'test_episodes': 100,
            'test_episodes_len': 1000}
    actor = DDPG(env, args=args)
    #actor_boundary(env, actor, 1000, 100)

    #################### Shield #################
    # The linear shield controller K is stored next to the NN checkpoint.
    model_path = os.path.split(args['model_path'])[0] + '/'
    linear_func_model_name = 'K.model'
    model_path = model_path + linear_func_model_name + '.npy'
    shield = Shield(env, actor, model_path=model_path, force_learning=False)
    shield.train_polysys_shield(learning_method, number_of_rollouts, simulation_steps, eq_err=eq_err, explore_mag=0.4, step_size=0.5)
    shield.test_shield(100, 1000, mode="single")
    #shield.test_shield(100, 1000, mode="all")
    actor.sess.close()
oscillator("random_search", 200, 200, 0, [240, 200], [280, 240, 200], "ddpg_chkp/oscillator/10/240200280240200/")