Q-Learning
# Import packages
import numpy as np
import pandas as pd
import time
Define the parameters
np.random.seed(2)  # fix the seed so runs are reproducible

N_STATES = 6                 # length of the 1-D world, i.e. the initial distance from the treasure
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy rate: pick the best-known action 90% of the time, a random one 10%
ALPHA = 0.1                  # learning rate
LAMBDA = 0.9                 # discount factor for future rewards
MAX_EPISODES = 13            # number of episodes
FRESH_TIME = 0.3             # seconds per step, only to make the animation visible
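A small worked example (my own illustration, not in the original) of what LAMBDA does here: the only reward is the 1 collected at the treasure, and because that reward arrives undiscounted on the final step, the Bellman optimality equation gives the optimal value of moving right from state k as LAMBDA ** (N_STATES - 2 - k):

for k in range(N_STATES - 1):                  # non-terminal states 0..4
    print(k, LAMBDA ** (N_STATES - 2 - k))     # 0.9**4 ≈ 0.656 at the start, 1.0 next to the treasure

These are the values the learned Q-table's 'right' column should approach with enough episodes.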
# Build the Q-table: one row per state, one column per action, all values start at zero
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),  # initial Q values
        columns=actions,                     # action names become the column labels
    )
    print(table)
    return table
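For reference, with N_STATES = 6 and ACTIONS = ['left', 'right'], the table printed by build_q_table(N_STATES, ACTIONS) starts out all zeros:

   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0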
# Choose an action with an epsilon-greedy policy
def choice_action(state, q_table):
    # Select the row of Q values for the current state
    state_action = q_table.iloc[state, :]
    # Explore when the random draw exceeds EPSILON, or when this state is
    # still unexplored (all of its Q values are zero)
    if (np.random.uniform() > EPSILON) or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        # Otherwise exploit: take the action with the largest Q value
        action_name = state_action.idxmax()
    return action_name
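A quick way to see the 90/10 split in action (my own check, not part of the original): give 'right' a positive Q value in state 0 and sample choice_action repeatedly. Roughly 95% of the picks should be 'right': 90% greedy plus half of the 10% random picks.

q_demo = build_q_table(N_STATES, ACTIONS)   # hypothetical demo table; also prints the zeros
q_demo.loc[0, 'right'] = 0.5                # make 'right' the greedy action in state 0
picks = [choice_action(0, q_demo) for _ in range(1000)]
print(picks.count('right') / 1000)          # ~0.95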
# Build the environment's feedback: given state S and action A, return the next state S_ and reward R
def get_env_feedback(S, A):
    if A == 'right':
        if S == N_STATES - 2:   # the square just left of the treasure
            S_ = 'terminal'
            R = 1               # reaching the treasure is the only reward
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S              # already at the left wall, stay put
        else:
            S_ = S - 1
    return S_, R
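A few sanity checks (added here for illustration, not in the original) confirm the transition rules:

assert get_env_feedback(N_STATES - 2, 'right') == ('terminal', 1)  # step onto the treasure
assert get_env_feedback(2, 'right') == (3, 0)                      # ordinary move right
assert get_env_feedback(0, 'left') == (0, 0)                       # bump into the left wall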
# Render the environment: 'O' marks the agent, 'T' the treasure
def update_env(S, episode, step_counter):
    env_list = ['-'] * (N_STATES - 1) + ['T']
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r' + ' ' * len(interaction), end='')  # wipe the line before the next episode
    else:
        env_list[S] = 'O'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
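For example (an illustrative call of my own), rendering state 2 in the first episode:

update_env(2, 0, 0)   # prints '--O--T', then pauses FRESH_TIME seconds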
# Main loop: interact with the environment and update the Q-table with the Q-learning rule
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0                   # every episode starts at the far left
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choice_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            # Estimated value of the chosen action
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                # Target value: immediate reward plus the discounted best value of the next state
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()
            else:
                q_target = R
                is_terminated = True
            # Nudge the estimate toward the target
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
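The inner loop is the standard tabular Q-learning update. In the code's names, α is ALPHA and γ is LAMBDA (the variable is called LAMBDA, but it plays the role of the conventional discount factor γ):

Q(S, A) \leftarrow Q(S, A) + \alpha \bigl[ R + \gamma \max_{a} Q(S', a) - Q(S, A) \bigr]

Here q_predict is Q(S, A) and q_target is the bracketed target R + γ max_a Q(S', a).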
if __name__ == '__main__':
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)