import gym
import random
import numpy as np
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from gym.utils.step_api_compatibility import convert_to_done_step_api

# Create the LunarLander-v2 environment with the 'human' render mode requested.
env = gym.make(
    'LunarLander-v2',
    render_mode='human',
)

def build_dqn(lr, num_of_actions, input_dims, first_layer_dims, second_layer_dims):
    """Build and compile the fully connected Q-network.

    The network is two ReLU-activated hidden Dense layers followed by a
    linear Dense output layer with one unit per action.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        num_of_actions: Number of units in the output layer.
        input_dims: Length of the 1D input vector.
        first_layer_dims: Number of units in the first hidden layer.
        second_layer_dims: Number of units in the second hidden layer.

    Returns:
        The Sequential model compiled with Adam and mean-squared-error loss.
    """
    network = Sequential()

    # First hidden block; it also declares the input shape of the network.
    network.add(Dense(first_layer_dims, input_shape=(input_dims,)))
    network.add(Activation('relu'))

    # Second hidden block.
    network.add(Dense(second_layer_dims))
    network.add(Activation('relu'))

    # Output layer without an activation: one raw Q-value per action.
    network.add(Dense(num_of_actions))

    optimizer = Adam(learning_rate=lr)
    network.compile(optimizer=optimizer, loss='mse')
    return network
    
def learn(transition, epsilon, epsilon_dec, epsilon_min, gamma, model):
    """Perform one Q-learning update on a single transition and decay epsilon.

    The training target equals the model's current prediction for ``state``
    except at the index of the executed action, which is replaced by
    ``reward`` (terminal transition) or by
    ``reward + gamma * max(Q(new_state))`` (non-terminal transition).

    Args:
        transition: Sequence whose first five items are
            ``(state, action, reward, new_state, done)``.
        epsilon: Current exploration rate.
        epsilon_dec: Multiplicative decay factor applied to ``epsilon``.
        epsilon_min: Floor for the decayed exploration rate.
        gamma: Discount factor applied to the best next-state Q-value.
        model: Object with Keras-style ``predict(x, verbose=...)`` and
            ``fit(x, y, verbose=...)`` methods.

    Returns:
        The exploration rate after decay. A decay step never takes it below
        ``epsilon_min``; a value already at or below the floor is returned
        unchanged.
    """
    state, action, reward, new_state, done = transition[:5]

    # The model works on 2D batches, so the 1D state is wrapped into a
    # one-row batch and row 0 of the prediction holds its Q-values.
    state_batch = np.array([state])
    q_target = model.predict(state_batch, verbose=0)[0]

    # Only the executed action's entry is overwritten; the remaining entries
    # keep the model's own prediction for the original state.
    if done:
        # The episode ended: only the immediate reward counts, with no
        # expectation of further rewards.
        q_target[action] = reward
    else:
        next_q_values = model.predict(np.array([new_state]), verbose=0)[0]
        q_target[action] = reward + gamma * np.amax(next_q_values)

    # One gradient-descent fit on this single sample to reduce the
    # prediction error.
    model.fit(state_batch, np.array([q_target]), verbose=0)

    # Decay epsilon while it is above the floor, clamping the result so the
    # final decay step cannot undershoot epsilon_min.
    if epsilon > epsilon_min:
        epsilon = max(epsilon * epsilon_dec, epsilon_min)

    return epsilon

# Network dimensions: number of Q-value outputs and length of the input vector.
num_of_actions = 4
input_dims = 8

# Learning hyperparameters.
gamma = 0.99
learning_rate = 0.0005
first_layer_dims = 256
second_layer_dims = 256

# Epsilon-greedy exploration schedule (decayed inside learn() on every step).
epsilon = 1.0
epsilon_dec = 0.996
epsilon_min = 0.01

# Training length and per-episode cut-off.
num_episodes = 300
max_frames = 600        # frame counter value at which an episode is cut off
timeout_reward = -100   # reward substituted on the step that hits the cut-off

model = build_dqn(learning_rate, num_of_actions, input_dims, first_layer_dims, second_layer_dims)

eps_history = []
scores = []

print('Trenovanie sa zacalo:')

# try/finally guarantees the environment is closed even when training is
# interrupted (e.g. Ctrl-C) or a step raises.
try:
    for episode in range(num_episodes):
        score = 0
        state, _ = env.reset()
        done = False
        frame = 0
        while not done:
            random_float = np.random.random()

            # random_float < epsilon holds with probability epsilon, so the
            # larger epsilon is, the more likely the agent takes a random
            # action instead of the greedy one.
            if random_float < epsilon:
                action = random.randint(0, num_of_actions - 1)
            else:
                # The batch has a single row, so argmax over the flattened
                # prediction is the index of the best action.
                q_values = model.predict(np.array([state]), verbose=0)
                action = np.argmax(q_values)

            new_state, reward, done, _ = convert_to_done_step_api(env.step(action))

            # Cut the episode off once the frame counter reaches the limit
            # and replace the step's reward with the timeout penalty.
            if frame == max_frames:
                done = True
                reward = timeout_reward

            transition = state, action, reward, new_state, done
            epsilon = learn(transition, epsilon, epsilon_dec, epsilon_min, gamma, model)

            state = new_state
            score = score + reward
            frame += 1

        eps_history.append(epsilon)
        scores.append(score)

        print(f'V epizode cislo {episode!s} agent ziskal kumulativnu odmenu: {score!s}'
              f' a epsilon dosiahol hodnotu: {epsilon!s}')
finally:
    env.close()