# Q-Learning paper
# Import packages
import numpy as np
import pandas as pd
import time
# Define parameters
# Fix the NumPy RNG seed so every run draws the same random numbers.
np.random.seed(2)

# Number of cells in the 1-D world; the last cell holds the treasure 'T'.
N_STATES = 6
# Actions available to the agent in every state.
ACTIONS = ['left', 'right']
# Greedy threshold: a uniform draw above this value triggers a random action.
EPSILON = 0.9
# Learning rate applied to the temporal-difference error.
ALPHA = 0.1
# Discount factor applied to the best Q-value of the next state.
LAMBDA = 0.9
# Number of training episodes.
MAX_EPISODES = 13
# Pause (in seconds) after each rendered move, so the animation is visible.
FRESH_TIME = 0.3
# Build the Q-table.
def build_q_table(n_state, actions):
    """Create, print and return a zero-initialised Q-table.

    Args:
        n_state: Number of rows (one per state).
        actions: Sequence of action labels; each becomes a column.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_state, len(actions))`` filled
        with ``0.0``.
    """
    initial_values = np.zeros((n_state, len(actions)))
    table = pd.DataFrame(initial_values, columns=actions)
    # Show the initial table on stdout.
    print(table)
    return table
# Choose an action.
def choice_action(state, q_tabel):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    A random action from ``ACTIONS`` is chosen when a uniform draw exceeds
    ``EPSILON`` or when every Q-value of the state is still zero (nothing
    has been learned for it yet). Otherwise the action with the largest
    Q-value is returned.

    Args:
        state: Integer row position of the current state in ``q_tabel``.
        q_tabel: Q-table ``DataFrame`` whose columns are the action labels.

    Returns:
        The label of the chosen action.
    """
    # Q-values of every action for the current state.
    state_action = q_tabel.iloc[state, :]
    # NOTE: the previous test `state_action.all() == 0` was true as soon as
    # *any* value was zero, so a state with one learned action still acted
    # randomly. `(state_action == 0).all()` is true only when all are zero.
    if (np.random.uniform() > EPSILON) or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        # idxmax returns the column label of the largest value directly, so
        # the result does not depend on ACTIONS matching the column order.
        action_name = state_action.idxmax()
    return action_name
# Environment dynamics: how the world responds to an action.
def get_env_feedback(S, A):
    """Return the next state and reward for taking action ``A`` in state ``S``.

    Args:
        S: Current state (integer position).
        A: Action label; ``'right'`` moves right, anything else moves left.

    Returns:
        A tuple ``(S_, R)``. ``S_`` is the next position, or the string
        ``'terminal'`` when moving right from position ``N_STATES - 2``.
        ``R`` is ``1`` for that terminal move and ``0`` otherwise.
    """
    if A != 'right':
        # Moving left is never rewarded; position 0 acts as a wall.
        return (S if S == 0 else S - 1), 0
    if S == N_STATES - 2:
        # Stepping right from the cell next to the treasure ends the episode.
        return 'terminal', 1
    return S + 1, 0
def update_env(S, episode, step_counter):
    """Render the current state of the 1-D world on a single terminal line.

    For a regular state, draws ``N_STATES - 1`` dashes followed by ``'T'``,
    with ``'O'`` marking the agent, then sleeps ``FRESH_TIME`` seconds.
    For the ``'terminal'`` state, prints an episode summary, sleeps two
    seconds and returns the cursor to the start of the line.

    Args:
        S: Current state; an integer position or the string ``'terminal'``.
        episode: Zero-based episode index (displayed as ``episode + 1``).
        step_counter: Number of steps taken so far in this episode.
    """
    if S == 'terminal':
        summary = 'Episode {}: total_steps= {}'.format(episode + 1, step_counter)
        # '\r' moves the cursor to the line start so the text overwrites
        # the previous frame.
        print('\r' + summary, end='')
        time.sleep(2)
        print('\r', end='')
        return

    cells = ['-'] * (N_STATES - 1) + ['T']
    cells[S] = 'O'
    print('\r' + ''.join(cells), end='')
    time.sleep(FRESH_TIME)
def rl():
    """Run the Q-learning training loop and return the learned Q-table.

    Runs ``MAX_EPISODES`` episodes, each starting at state 0 and ending when
    the environment reports ``'terminal'``. After every step the chosen
    state/action entry is moved towards the target by a factor of ``ALPHA``
    and the world is re-rendered.

    Returns:
        The Q-table ``DataFrame`` after all episodes.
    """
    q_tabel = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        S = 0
        step_counter = 0
        update_env(S, episode, step_counter)

        reached_goal = False
        while not reached_goal:
            A = choice_action(S, q_tabel)
            S_, R = get_env_feedback(S, A)

            # Current estimate for the state/action pair just taken.
            q_predict = q_tabel.loc[S, A]

            reached_goal = S_ == 'terminal'
            if reached_goal:
                # No future value beyond the terminal state.
                q_target = R
            else:
                # Reward plus discounted best value of the next state.
                q_target = R + LAMBDA * q_tabel.iloc[S_, :].max()

            q_tabel.loc[S, A] += ALPHA * (q_target - q_predict)

            S = S_
            step_counter += 1
            update_env(S, episode, step_counter)
    return q_tabel
if __name__ == '__main__':
    # Train, then print the learned Q-table under a heading.
    q_tabel = rl()
    print('\r\n Q-tabel:\n', q_tabel, sep='\n')