from tabnanny import verbose
import gym
import random
import numpy as np
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import Adam
import collections
from gym.utils.step_api_compatibility import convert_to_done_step_api

# Create the LunarLander-v2 environment that the training loop interacts with.
# NOTE(review): render_mode='human' presumably renders every step in a window,
# which would slow training considerably - confirm this is intended.
env = gym.make('LunarLander-v2', render_mode='human')

def load_memory_batch(memory, batch_size):
    """Draw ``batch_size`` transitions uniformly at random from ``memory``.

    Sampling is done with replacement, so the same transition may appear
    more than once in the returned batch.

    Args:
        memory: Non-empty sequence of stored transitions (e.g. a ``deque``).
        batch_size: Number of transitions to draw.

    Returns:
        A list containing exactly ``batch_size`` transitions.

    Raises:
        IndexError: If ``memory`` is empty and ``batch_size`` is positive.
    """
    # range(batch_size), not range(1, batch_size): the latter produced one
    # sample fewer than requested.
    return [random.choice(memory) for _ in range(batch_size)]

def build_dqn(lr, num_of_actions, input_dims, first_layer_dims, second_layer_dims):
    """Assemble and compile the fully connected Q-network.

    The network has two hidden ``Dense`` layers, each followed by a ReLU
    activation, and a linear output layer with one unit per action.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        num_of_actions: Number of units in the output layer.
        input_dims: Length of the input (state) vector.
        first_layer_dims: Number of units in the first hidden layer.
        second_layer_dims: Number of units in the second hidden layer.

    Returns:
        The compiled ``Sequential`` model (MSE loss, Adam optimizer).
    """
    network = Sequential()

    # First hidden layer also fixes the input shape of the model.
    network.add(Dense(first_layer_dims, input_shape=(input_dims,)))
    network.add(Activation('relu'))

    # Second hidden layer.
    network.add(Dense(second_layer_dims))
    network.add(Activation('relu'))

    # Output layer: one unactivated value per action.
    network.add(Dense(num_of_actions))

    network.compile(loss='mse', optimizer=Adam(learning_rate=lr))
    return network
    
def learn(memory, batch_size, epsilon, epsilon_dec, epsilon_min, gamma, model):
    """Fit ``model`` on one random replay batch and decay the exploration rate.

    Nothing is learned, and ``epsilon`` is returned unchanged, until ``memory``
    holds more than ``batch_size`` transitions.

    Args:
        memory: Replay buffer; each entry is a 5-tuple
            ``(state, action, reward, new_state, done)``.
        batch_size: Number of transitions requested from the replay buffer.
        epsilon: Current exploration rate.
        epsilon_dec: Multiplicative decay applied to ``epsilon`` after a fit.
        epsilon_min: Lower bound for ``epsilon``.
        gamma: Discount factor applied to the best Q-value of the new state.
        model: Object exposing Keras-style ``predict`` and ``fit`` methods.

    Returns:
        The exploration rate to use from now on.
    """
    # Learn only once the memory holds more transitions than one batch.
    if len(memory) <= batch_size:
        return epsilon

    batch = load_memory_batch(memory, batch_size)
    if not batch:
        # Nothing was sampled, so there is nothing to fit on.
        return epsilon

    # Split the list of transitions into per-field columns.
    states, actions, rewards, new_states, dones = zip(*batch)
    states = np.array(states)

    # Start from the network's own predictions so that, for actions that were
    # not taken, target == prediction and they contribute no error.
    q_targets = model.predict(states, verbose=0)
    q_next = model.predict(np.array(new_states), verbose=0)

    rewards = np.asarray(rewards, dtype=float)
    # When the episode ended the agent receives only the reward; otherwise the
    # discounted best Q-value of the new state is added. np.where (rather than
    # multiplying by a 0/1 mask) keeps a non-finite prediction for a finished
    # state from leaking into its target.
    taken_action_targets = np.where(
        np.asarray(dones, dtype=bool),
        rewards,
        rewards + gamma * np.max(q_next, axis=1),
    )

    # Overwrite only the Q-value of the action that was actually taken.
    rows = np.arange(len(batch))
    q_targets[rows, np.asarray(actions, dtype=int)] = taken_action_targets

    # Gradient descent step(s) reducing the prediction error on this batch.
    model.fit(states, q_targets, verbose=0)

    # Decay epsilon, but never let it fall below the configured minimum.
    if epsilon > epsilon_min:
        epsilon = max(epsilon * epsilon_dec, epsilon_min)

    return epsilon
  

# Network interface sizes: num_of_actions is the width of the output layer and
# the upper bound for random action draws; input_dims is the length of the
# state vector fed to the first layer.
# NOTE(review): 4 and 8 presumably match LunarLander-v2's action and
# observation spaces - confirm against the environment.
num_of_actions = 4
input_dims = 8

# gamma: discount factor for the next state's best Q-value in learn().
# learning_rate: passed to the Adam optimizer in build_dqn().
# first_layer_dims / second_layer_dims: widths of the two hidden layers.
gamma = 0.99
learning_rate = 0.0005
first_layer_dims = 256
second_layer_dims = 256

# Epsilon-greedy exploration: epsilon is the probability of a random action,
# multiplied by epsilon_dec in learn() while it is above epsilon_min.
epsilon = 1.0
epsilon_dec=0.996
epsilon_min=0.01

# Number of transitions requested from the replay memory per learning step.
batch_size = 64

model = build_dqn(learning_rate, num_of_actions, input_dims, first_layer_dims, second_layer_dims)

# Per-episode history: epsilon at the end of each episode and the episode score.
eps_history = []
scores = []

print('Trenovanie sa zacalo:')

# Replay memory; a deque with maxlen drops its oldest entries once it is full.
memory = collections.deque(maxlen=60000)

# Main training loop: play 300 episodes with an epsilon-greedy policy, store
# every transition in the replay memory and run one learning step per
# environment step. The environment is closed even if training is interrupted.
try:
    for episode in range(300):
        score = 0
        state, _ = env.reset()
        done = False
        while not done:
            # With probability epsilon take a random action instead of the
            # greedy one; the larger epsilon is, the more the agent explores.
            if np.random.random() < epsilon:
                action = np.random.randint(0, num_of_actions)
            else:
                q_values = model.predict(np.array([state]), verbose=0)
                action = np.argmax(q_values)

            new_state, reward, terminated, truncated, _ = env.step(action)

            # The episode stops on either signal ...
            done = terminated or truncated

            # ... but only a real termination is stored as the terminal flag.
            # A time-limit truncation is not an end state, so learn() must
            # still bootstrap from new_state for such transitions.
            memory.append((state, action, reward, new_state, terminated))

            epsilon = learn(memory, batch_size, epsilon, epsilon_dec, epsilon_min, gamma, model)

            state = new_state
            score = score + reward

        eps_history.append(epsilon)
        scores.append(score)

        print('V epizode cislo ' + str(episode+1) + ' agent ziskal kumulativnu odmenu: ' + str(score) +
        ' a epsilon dosiahol hodnotu: ' + str(epsilon))
finally:
    # Release the environment (and its render window) on errors or Ctrl-C too.
    env.close()