Q-Learning
# Import packages
import numpy as np
import pandas as pd
import time
Define the parameters
np.random.seed(2)  # fix the seed so runs are reproducible

N_STATES = 6                 # length of the 1-D world, i.e. the initial distance from the treasure
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy rate: pick the best-known action 90% of the time, a random one 10%
ALPHA = 0.1                  # learning rate
LAMBDA = 0.9                 # discount factor for future rewards
MAX_EPISODES = 13            # number of episodes
FRESH_TIME = 0.3             # seconds per step, only to make the animation visible
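A small worked example (my own illustration, not in the original) of what LAMBDA does here: the only reward is the 1 collected at the treasure, and because that reward arrives undiscounted on the final step, the Bellman optimality equation gives the optimal value of moving right from state k as LAMBDA ** (N_STATES - 2 - k):

for k in range(N_STATES - 1):                  # non-terminal states 0..4
    print(k, LAMBDA ** (N_STATES - 2 - k))     # 0.9**4 ≈ 0.656 at the start, 1.0 next to the treasure

These are the values the learned Q-table's 'right' column should approach with enough episodes.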
# Build the Q-table: one row per state, one column per action, all values start at zero
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),  # initial Q values
        columns=actions,                     # action names become the column labels
    )
    print(table)
    return table
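For reference, with N_STATES = 6 and ACTIONS = ['left', 'right'], the table printed by build_q_table(N_STATES, ACTIONS) starts out all zeros:

   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0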
# Choose an action with an epsilon-greedy policy
def choice_action(state, q_table):
    # Select the row of Q values for the current state
    state_action = q_table.iloc[state, :]
    # Explore when the random draw exceeds EPSILON, or when this state is
    # still unexplored (all of its Q values are zero)
    if (np.random.uniform() > EPSILON) or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        # Otherwise exploit: take the action with the largest Q value
        action_name = state_action.idxmax()
    return action_name
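A quick way to see the 90/10 split in action (my own check, not part of the original): give 'right' a positive Q value in state 0 and sample choice_action repeatedly. Roughly 95% of the picks should be 'right': 90% greedy plus half of the 10% random picks.

q_demo = build_q_table(N_STATES, ACTIONS)   # hypothetical demo table; also prints the zeros
q_demo.loc[0, 'right'] = 0.5                # make 'right' the greedy action in state 0
picks = [choice_action(0, q_demo) for _ in range(1000)]
print(picks.count('right') / 1000)          # ~0.95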
# Build the environment's feedback: given state S and action A, return the next state S_ and reward R
def get_env_feedback(S, A):
    if A == 'right':
        if S == N_STATES - 2:   # the square just left of the treasure
            S_ = 'terminal'
            R = 1               # reaching the treasure is the only reward
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S              # already at the left wall, stay put
        else:
            S_ = S - 1
    return S_, R
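A few sanity checks (added here for illustration, not in the original) confirm the transition rules:

assert get_env_feedback(N_STATES - 2, 'right') == ('terminal', 1)  # step onto the treasure
assert get_env_feedback(2, 'right') == (3, 0)                      # ordinary move right
assert get_env_feedback(0, 'left') == (0, 0)                       # bump into the left wall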
# Render the environment: 'O' marks the agent, 'T' the treasure
def update_env(S, episode, step_counter):
    env_list = ['-'] * (N_STATES - 1) + ['T']
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r' + ' ' * len(interaction), end='')  # wipe the line before the next episode
    else:
        env_list[S] = 'O'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
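For example (an illustrative call of my own), rendering state 2 in the first episode:

update_env(2, 0, 0)   # prints '--O--T', then pauses FRESH_TIME seconds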
# Main loop: interact with the environment and update the Q-table with the Q-learning rule
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0                   # every episode starts at the far left
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choice_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            # Estimated value of the chosen action
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                # Target value: immediate reward plus the discounted best value of the next state
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()
            else:
                q_target = R
                is_terminated = True
            # Nudge the estimate toward the target
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
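The inner loop is the standard tabular Q-learning update. In the code's names, α is ALPHA and γ is LAMBDA (the variable is called LAMBDA, but it plays the role of the conventional discount factor γ):

Q(S, A) \leftarrow Q(S, A) + \alpha \bigl[ R + \gamma \max_{a} Q(S', a) - Q(S, A) \bigr]

Here q_predict is Q(S, A) and q_target is the bracketed target R + γ max_a Q(S', a).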
if __name__ == '__main__':
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)