• SARSA

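    The update rule that update_table below implements is SARSA, with step size $\alpha = 0.1$ and an implicit discount $\gamma = 1$:

    $$Q(S, A) \leftarrow Q(S, A) + \alpha \left( R + \gamma\, Q(S', A') - Q(S, A) \right)$$

    where $A'$ is the action the epsilon-greedy policy selects at the next state $S'$.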

    import random
    import numpy as np
    
    class GridWorld():
        def __init__(self):
            self.x = 0
            self.y = 0
        
        def step(self, a):
            # action 0: left, action 1: up, action 2: right, action 3: down
            if a==0:
                self.move_left()
            elif a==1:
                self.move_up()
            elif a==2:
                self.move_right()
            elif a==3:
                self.move_down()
    
            reward = -1  # the reward is fixed at -1 for every step
            done = self.is_done()
            return (self.x, self.y), reward, done
    
        def move_left(self):
            if self.y==0:
                pass
            elif self.y==3 and self.x in [0,1,2]:
                pass
            elif self.y==5 and self.x in [2,3,4]:
                pass
            else:
                self.y -= 1
    
        def move_right(self):
            if self.y==1 and self.x in [0,1,2]:
                pass
            elif self.y==3 and self.x in [2,3,4]:
                pass
            elif self.y==6:
                pass
            else:
                self.y += 1
          
        def move_up(self):
            if self.x==0:
                pass
            elif self.x==3 and self.y==2:
                pass
            else:
                self.x -= 1
    
        def move_down(self):
            if self.x==4:
                pass
            elif self.x==1 and self.y==4:
                pass
            else:
                self.x += 1
    
        def is_done(self):
            if self.x==4 and self.y==6: # the episode ends upon reaching the goal at (4, 6)
                return True
            else:
                return False
          
        def reset(self):
            self.x = 0
            self.y = 0
            return (self.x, self.y)
    
    class QAgent():
        def __init__(self):
            self.q_table = np.zeros((5, 7, 4)) # initialize the Q-table to zeros, as before
            self.eps = 0.9
    
        def select_action(self, s):
            # select an action epsilon-greedily
            x, y = s
            coin = random.random()
            if coin < self.eps:
                action = random.randint(0,3)
            else:
                action_val = self.q_table[x,y,:]
                action = np.argmax(action_val)
            return action
    
        def update_table(self, transition):
            s, a, r, s_prime = transition
            x,y = s
            next_x, next_y = s_prime
            a_prime = self.select_action(s_prime) # the action to be selected at S' (not the action actually taken in the environment)
            # apply the SARSA update rule (alpha = 0.1, gamma = 1)
            self.q_table[x,y,a] = self.q_table[x,y,a] + 0.1 * (r + self.q_table[next_x,next_y,a_prime] - self.q_table[x,y,a])
    
        def anneal_eps(self):
            self.eps -= 0.03
            self.eps = max(self.eps, 0.1)
    
        def show_table(self):
            # print the greedy action (argmax over the Q values) for every grid cell
            q_lst = self.q_table.tolist()
            data = np.zeros((5,7))
            for row_idx in range(len(q_lst)):
                row = q_lst[row_idx]
                for col_idx in range(len(row)):
                    col = row[col_idx]
                    action = np.argmax(col)
                    data[row_idx, col_idx] = action
            print(data)
    
          
    def main():
        env = GridWorld()
        agent = QAgent()
    
        for n_epi in range(1000):
            done = False
    
            s = env.reset()
            while not done:
                a = agent.select_action(s)
                s_prime, r, done = env.step(a)
                agent.update_table((s,a,r,s_prime))
                s = s_prime
            agent.anneal_eps()
    
        agent.show_table()
    
    if __name__ == '__main__':
        main()
    
  • Q-Learning

    • Off-policy => Target Policy ≠ Behavior Policy: Q-learning evaluates the greedy target policy while collecting data with the epsilon-greedy behavior policy.

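      The update rule that update_table below implements is Q-learning, with the same step size $\alpha = 0.1$ and implicit $\gamma = 1$:

      $$Q(S, A) \leftarrow Q(S, A) + \alpha \left( R + \gamma \max_{a'} Q(S', a') - Q(S, A) \right)$$

      The max over next actions is what makes this off-policy: the bootstrap target comes from the greedy policy, regardless of which action the behavior policy takes next.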

    import random
    import numpy as np
    
    class GridWorld():
        def __init__(self):
            self.x = 0
            self.y = 0
        
        def step(self, a):
            # action 0: left, action 1: up, action 2: right, action 3: down
            if a==0:
                self.move_left()
            elif a==1:
                self.move_up()
            elif a==2:
                self.move_right()
            elif a==3:
                self.move_down()
    
            reward = -1 # the reward is fixed at -1 for every step
            done = self.is_done()
            return (self.x, self.y), reward, done
    
        def move_left(self):
            if self.y==0:
                pass
            elif self.y==3 and self.x in [0,1,2]:
                pass
            elif self.y==5 and self.x in [2,3,4]:
                pass
            else:
                self.y -= 1
    
        def move_right(self):
            if self.y==1 and self.x in [0,1,2]:
                pass
            elif self.y==3 and self.x in [2,3,4]:
                pass
            elif self.y==6:
                pass
            else:
                self.y += 1
          
        def move_up(self):
            if self.x==0:
                pass
            elif self.x==3 and self.y==2:
                pass
            else:
                self.x -= 1
    
        def move_down(self):
            if self.x==4:
                pass
            elif self.x==1 and self.y==4:
                pass
            else:
                self.x += 1
    
        def is_done(self):
            if self.x==4 and self.y==6: # the episode ends upon reaching the goal at (4, 6)
                return True
            else:
                return False
          
        def reset(self):
            self.x = 0
            self.y = 0
            return (self.x, self.y)
    
    class QAgent():
        def __init__(self):
            self.q_table = np.zeros((5, 7, 4)) # initialize the Q-table to zeros, as before
            self.eps = 0.9
    
        def select_action(self, s):
            # select an action epsilon-greedily
            x, y = s
            coin = random.random()
            if coin < self.eps:
                action = random.randint(0,3)
            else:
                action_val = self.q_table[x,y,:]
                action = np.argmax(action_val)
            return action
    
        def update_table(self, transition):
            s, a, r, s_prime = transition
            x,y = s
            next_x, next_y = s_prime
            # apply the Q-learning update rule (alpha = 0.1, gamma = 1): bootstrap with the greedy action at S'
            self.q_table[x,y,a] = self.q_table[x,y,a] + 0.1 * (r + np.amax(self.q_table[next_x,next_y,:]) - self.q_table[x,y,a])
    
        def anneal_eps(self):
            self.eps -= 0.01  # anneal epsilon more slowly than in the SARSA example
            self.eps = max(self.eps, 0.2) 
    
        def show_table(self):
            # print the greedy action (argmax over the Q values) for every grid cell
            q_lst = self.q_table.tolist()
            data = np.zeros((5,7))
            for row_idx in range(len(q_lst)):
                row = q_lst[row_idx]
                for col_idx in range(len(row)):
                    col = row[col_idx]
                    action = np.argmax(col)
                    data[row_idx, col_idx] = action
            print(data)
          
    
    def main():
        env = GridWorld()
        agent = QAgent()
    
        for n_epi in range(1000):
            done = False
    
            s = env.reset()
            while not done:
                a = agent.select_action(s)
                s_prime, r, done = env.step(a)
                agent.update_table((s,a,r,s_prime))
                s = s_prime
            agent.anneal_eps()
    
        agent.show_table()
    
    if __name__ == '__main__':
        main()
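
    The only substantive difference between the two listings is the bootstrapping target inside update_table. Side by side (excerpted from the code above; td_target is an illustrative name, not a variable in the listings, and gamma = 1 is implicit in both):

    # SARSA (on-policy): bootstrap with the action the epsilon-greedy behavior
    # policy would select at S', so exploration noise flows into the learned values
    td_target = r + self.q_table[next_x, next_y, a_prime]

    # Q-learning (off-policy): bootstrap with the best action at S', so the
    # learned values track the greedy target policy however the agent explores
    td_target = r + np.amax(self.q_table[next_x, next_y, :])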