Demo entry 6721724

dqn

   

Submitted by anonymous on Mar 22, 2018 at 13:01
Language: Python 3. Code size: 7.6 kB.

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 19 14:18:57 2018

@author: ZXJ
"""

import tensorflow as tf
import numpy as np
import random
from collections import deque
import cvxopt

# Hyper Parameters for DQN
GAMMA = 0.9
LEARNING_RATE = 0.01
INITIAL_EPSILON = 0.7
FINAL_EPSILON = 0.01
EXPLORE = 20000
MEMORY_SIZE = 500
BATCH_SIZE = 32
H1_NEURO = 16
H2_NEURO = 8
EPISODE = 10000
STEP = 500

# Parameters for RRHs
P_ACTIVE = 6.8
P_SLEEP = 4.3
P_TRANS = 3.0
AMP_EFF = 0.25


class DQN():     
     def __init__(self):
          self.replay_buffer = deque()
          self.epsilon = INITIAL_EPSILON
          self.state_num = 5
          self.action_num = 3
          self.state_list = np.array([[0,0,0,0.1,0.1],
                                      [0,0,1,0.1,0.1],
                                      [0,1,0,0.1,0.1],
                                      [0,1,1,0.1,0.1],
                                      [1,0,0,0.1,0.1],
                                      [1,0,1,0.1,0.1],
                                      [1,1,0,0.1,0.1],
                                      [1,1,1,0.1,0.1],
                                      ])
          self.action_list = np.identity(self.action_num)          
          self.loss_his = []
          self.step = 0
          
          self.creat_network()
          
          self.session = tf.InteractiveSession()
          self.session.run(tf.global_variables_initializer())
          
     def creat_network(self):
          # input
          self.state_input = tf.placeholder(dtype=tf.float32, shape=[None,self.state_num])
          self.action_input = tf.placeholder(dtype=tf.float32, shape=[None,self.action_num])
          
          # weights and bias
          w1 = tf.Variable(tf.random_normal([self.state_num,H1_NEURO]))
          b1 = tf.Variable(tf.zeros([1,H1_NEURO])+0.1)
          h1 = tf.nn.relu(tf.matmul(self.state_input,w1)+b1)
          
          w2 = tf.Variable(tf.random_normal([H1_NEURO,H2_NEURO]))
          b2 = tf.Variable(tf.zeros([1,H2_NEURO])+0.1)
          h2 = tf.nn.relu(tf.matmul(h1,w2)+b2)
          
          w3 = tf.Variable(tf.random_normal([H2_NEURO,self.action_num]))
          b3 = tf.Variable(tf.zeros([1,self.action_num])+0.1)
          self.q_value = tf.matmul(h2,w3)+b3
          
          # loss
          self.q_target = tf.placeholder(dtype=tf.float32, shape=[None])
          self.q_action = tf.reduce_sum(tf.multiply(self.q_value,self.action_input),reduction_indices=1)
          self.loss = tf.reduce_mean(tf.square(self.q_target-self.q_action))
          self.optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(self.loss)
          
     def egreedy_action(self,state):
          """
          e-greedy策略做随机动作选择
          注意返回的其实是该动作在动作空间中的索引
          """
          if np.random.uniform()<self.epsilon:
               action_index = np.random.randint(0,self.action_num)
          else:
               q_value = self.session.run(self.q_value,feed_dict={
                         self.state_input:state})
               action_index = np.argmax(q_value)
               
          if self.epsilon > FINAL_EPSILON:
               self.epsilon -= (INITIAL_EPSILON-FINAL_EPSILON)/EXPLORE
          
          #print(action_index)
          return action_index
     
     def best_action(self,state):
          """
          测试时直接选择最好的动作,不做随机选择
          """
          q_value = self.session.run(self.q_value,feed_dict={
                    self.state_input:state})
          action_index = np.argmax(q_value)
          return action_index
     
     def excute_action(self,state,action_index):
          """
          执行动作,获取下一个状态、奖励等信息
          """      
          # next state after excuting action
          current_state = state
          current_state[0][action_index] = 1 - state[0][action_index]
          #print(state)
          next_state = current_state
          #print(next_state.shape)
          # set of active RRHs
          active_RRH = np.reshape(next_state[0][0:3],[1,3])
          #print(next_state[0][0:3])
          #print(active_RRH.shape)
          
          # CP-Beamforming Optimization
          # get beamforming weights w_r,u
          self.beam_opt(active_RRH)
          
          # immediate reward
          w_ru = np.random.normal(1,1,[3,2])
          #print(w_ru)
          p_max = tf.constant([10.0], dtype=tf.float32)
          p_act = 1/AMP_EFF*tf.reduce_sum(tf.matmul(active_RRH,np.multiply(w_ru,w_ru)))
          p_actual = tf.to_float(p_act)
          #print(p_actual)          

          reward = p_max - p_actual
          #print(p_max, p_actual, reward)
          
          return next_state, reward
     
     def store_memory(self,state,action_index,reward,next_state):
          """
          记忆存储
          """
          self.step += 1
          
          action = self.action_list[action_index]
          
          self.replay_buffer.append((state,action,reward,next_state))
          
          if len(self.replay_buffer) > MEMORY_SIZE:
               self.replay_buffer.popleft() 
               
     def beam_opt(self,active_RRH):
          """
          cvxopt求解二阶锥优化问题
          """
          pass
     
     def train_network(self):
          """
          经验回放,从记忆池取样本训练Q网络
          """
          
          # obtain random minibatch from memory
          minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
          #print(minibatch)
          state_batch = np.vstack([data[0] for data in minibatch])
          action_batch = np.vstack([data[1] for data in minibatch])
          reward_batch = np.vstack([data[2] for data in minibatch])
          next_state_batch = np.vstack([data[3] for data in minibatch])
          #print("next_state_batch:", next_state_batch)
          
          # calculate y_target
          y_target = []
          q_next = self.session.run([self.q_value],feed_dict={
                    self.state_input : next_state_batch})
          #print(q_next)
          for i in range(BATCH_SIZE):
               current_reward = reward_batch[i][0]
               q_value = current_reward + GAMMA*(np.max(q_next[0][i]))
               y_target.append(q_value)
          
          print(state_batch)
          print(action_batch)
          print(y_target)
          
          # update DNN by loss function
          _, loss = self.session.run([self.optimizer,self.loss],feed_dict={
                              self.state_input:state_batch,
                              self.action_input:action_batch,
                              self.q_target:y_target})
     
          # history of loss value
          self.loss_his.append(loss)
          if self.step % 1000 ==0:
               print("loss:", loss)
          
     def train(self):
          for episode in range(EPISODE):
               state_index = np.random.randint(0,8)
               state =np.reshape(self.state_list[state_index],(1,5))
               
               #print(state)
               for step in range(STEP):
                    action_index = self.egreedy_action(state)
                    next_state, reward = self.excute_action(state,action_index)
                    self.store_memory(state,action_index,reward,next_state)
                    if self.step > 32:
                         self.train_network()                    
                    state = next_state
               
if __name__ == '__main__':
     dqn = DQN()
     dqn.train()

This snippet took 0.02 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).