diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 485dee64bcfb48793379b200a1afd14e85a8aaf4..0000000000000000000000000000000000000000
--- a/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-.idea
diff --git a/data/1/True/2/bn_belief_caregiver_assistive_action.pkl b/data/1/True/2/bn_belief_caregiver_assistive_action.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/1/True/2/bn_belief_caregiver_feedback_action.pkl b/data/1/True/2/bn_belief_caregiver_feedback_action.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/1/True/2/bn_belief_user_action.pkl b/data/1/True/2/bn_belief_user_action.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/1/True/2/bn_belief_user_react_time.pkl b/data/1/True/2/bn_belief_user_react_time.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/1/True/2/bn_variables.csv b/data/1/True/2/bn_variables.csv
new file mode 100644
index 0000000000000000000000000000000000000000..7e20b0db59badcf8618560e29358107ca00a7190
--- /dev/null
+++ b/data/1/True/2/bn_variables.csv
@@ -0,0 +1,8 @@
+user_react_time,game_state,attempt,agent_feedback,user_memory,user_action,agent_assistance,user_reactivity
+2,0,0,0,0,0,0,0
+1,0,0,0,0,0,1,0
+1,1,0,0,0,0,1,0
+0,1,0,0,0,2,0,0
+0,1,1,0,0,0,2,0
+0,2,0,0,0,2,0,0
+1,2,1,0,0,0,1,0
diff --git a/data/1/True/2/log_gen.csv b/data/1/True/2/log_gen.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3f65a302f3538069245923325a78b16049f62b5f
--- /dev/null
+++ b/data/1/True/2/log_gen.csv
@@ -0,0 +1,6 @@
+cum_elapsed_time,to,cum_react_time,from,timeout,user_action,token_id,caregiver_feedback,avg_caregiver_assistance_per_move,attempt
+7.621,1,0.0,13,0,,289,0,0.0,1
+0.71,2,7.663,8,0,,311,0,0.0,1
+1.738,3,5.253,15,0,,321,0,0.0,1
+0.931,4,10.58,20,1,,499,0,0.0,2
+0.826,5,6.484,12,2,,537,0,0.0,2
diff --git a/data/1/True/2/log_params.csv b/data/1/True/2/log_params.csv
new file mode 100644
index 0000000000000000000000000000000000000000..0d7bce0952cc4e3404c5c391dff84a65baa57a47
--- /dev/null
+++ b/data/1/True/2/log_params.csv
@@ -0,0 +1,2 @@
+with_feedback,objective,session,user_id,timeout
+True,ascending_odd,2,1,15
diff --git a/data/1/True/2/log_spec.csv b/data/1/True/2/log_spec.csv
new file mode 100644
index 0000000000000000000000000000000000000000..04b71db330e4d1da0afbdd2e1b2eb5ec33d7d7d0
--- /dev/null
+++ b/data/1/True/2/log_spec.csv
@@ -0,0 +1,8 @@
+game_state,user_action,attempt,caregiver_feedback,react_time,caregiver_assistance,user_react_time,from,elapsed_time,to,timeout,token_id
+0,0,1,0,0.0,0,2,13,7.621,1,0,289
+0,0,1,0,7.663,0,1,8,0.71,2,0,311
+1,0,1,0,5.253,0,1,15,1.738,3,0,321
+1,2,1,0,15,0,0,,0,,0,
+1,0,2,0,10.58,0,0,20,0.931,4,1,499
+2,2,1,0,15,0,0,,0,,1,
+2,0,2,0,6.484,0,1,12,0.826,5,2,537
diff --git a/data/1/True/2/log_summary.csv b/data/1/True/2/log_summary.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d2ff67021636daca58d28ab08d7452c02811ddfb
--- /dev/null
+++ b/data/1/True/2/log_summary.csv
@@ -0,0 +1,2 @@
+tot_elapsed_time,avg_lev_assistance,tot_react_time,n_timeout,n_attempt,n_sociable
+11.826,0.0,29.980000000000004,3,7,0
diff --git a/main.py b/main.py
index 09624e73702b6ccca5c7dac1070bee0aa4de4bd4..01e168b6839a9b9eda52002848a69f28d2bad090
--- a/main.py
+++ b/main.py
@@ -22,9 +22,9 @@ import itertools
 import os
 import math
 import operator
-import datetime
 import pickle
 import bnlearn
+import argparse
 
 from cognitive_game_env import CognitiveGame
 from episode import Episode
@@ -108,10 +108,10 @@ def get_entropy(policies, state_space, action_space):
     # highlight high and low entropy states
     entropy_sort = sorted(entropy.items(), key=operator.itemgetter(1))
-    s_preferences = [entropy_sort[i][0] for i in range(-1, -6, -1)]
-    s_constraints = [entropy_sort[i][0] for i in range(5)]
+    s_constraints = [entropy_sort[i][0] for i in range(-1, -6, -1)]
+    s_preferences = [entropy_sort[i][0] for i in range(5)]
 
-    return s_preferences, s_constraints
+    return s_constraints, s_preferences
 
 
 def maxent(world, terminal, trajectories):
@@ -125,6 +125,7 @@ def maxent(world, terminal, trajectories):
     estimation of the reward based on the MEIRL
     """
     # set up features: we use one feature vector per state
+    #features = world.state_features()
     features = world.assistive_feature(trajectories)
 
     # choose our parameter initialization strategy:
@@ -136,7 +137,8 @@ def maxent(world, terminal, trajectories):
     optim = O.ExpSga(lr=O.linear_decay(lr0=0.1))
 
     # actually do some inverse reinforcement learning
-    reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)
+    reward = M.irl_causal(world.p_transition, features, terminal, trajectories, optim, init, discount=0.1)
+    #reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)
 
     return reward
@@ -157,22 +159,28 @@ def merge_agent_policy(policy_from_data, policy_from_therapist):
         merged_policy[index]
         merged_policy[index] = list(map(lambda x:sum(x), ))
 
-def merge_user_log(folder_pathname, user_id, with_feedback, column_to_remove):
-    absolute_path = folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
+def merge_user_log(tpi_folder_pathname, file_output, user_id, with_feedback, rpi_folder_pathname=None, column_to_remove=None):
+    tpi_absolute_path = tpi_folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
+    if rpi_folder_pathname!=None:
+        rpi_absolute_path = rpi_folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
+    else:
+        rpi_absolute_path=None
+
+    episodes_length, tpi_episode_length, rpi_episode_length = [], [], []
+
     df = pd.DataFrame()
-    if len(absolute_path)==0:
-        print("Error no folders in path ", absolute_path)
-    # else:
-    #     df = pd.read_csv(absolute_path+"/1/bn_variables.csv")
+    if len(tpi_absolute_path)==0:
+        print("Error no folders in path ", tpi_absolute_path)
 
     if column_to_remove!=None:
         df = df.drop(column_to_remove, axis=1)
     #df_removed = df.drop(["user_memory", "user_reactivity"], axis=1)
-    sessions_directory = os.listdir(absolute_path)
-    episode_length = [0]*(len(sessions_directory)+1)
+    tpi_sessions_directory = os.listdir(tpi_absolute_path)
+    tpi_episode_length = [0]*(len(tpi_sessions_directory)+1)
 
-    for i in range(len(sessions_directory)):
-        file_folder = absolute_path+"/"+sessions_directory[i]
+    files = None
+    for i in range(len(tpi_sessions_directory)):
+        file_folder = tpi_absolute_path+"/"+tpi_sessions_directory[i]
         print("File folder: ", file_folder)
 
         files = os.listdir(file_folder)
@@ -180,27 +188,50 @@ def merge_user_log(folder_pathname, user_id, with_feedback, column_to_remove):
             if files[k] == "bn_variables.csv":
                 df_ = pd.read_csv(file_folder+"/"+files[k])
                 df = df.append(df_)
-                episode_length[i+1] = episode_length[i]+(df_.shape[0]-1)+1
-    df.to_csv(absolute_path + "/summary_bn_variables.csv", index=False)
-    return df, episode_length
+                tpi_episode_length[i+1] = tpi_episode_length[i]+(df_.shape[0]-1)+1
+
+    if rpi_folder_pathname!=None and len(rpi_absolute_path)!=None:
+        rpi_sessions_directory = os.listdir(rpi_absolute_path)
+        rpi_episode_length = [0] * len(rpi_sessions_directory)
+
+        files = None
+        for i in range(len(rpi_sessions_directory)):
+            file_folder = rpi_absolute_path + "/" + rpi_sessions_directory[i]
+            print("File folder: ", file_folder)
+            files = os.listdir(file_folder)
+
+            for k in range(len(files)):
+                if files[k] == "bn_variables.csv":
+                    df_ = pd.read_csv(file_folder + "/" + files[k])
+                    df = df.append(df_)
+                    rpi_episode_length[i] = (sum(rpi_episode_length)+ (df_.shape[0] - 1) + 1)
+        rpi_episode_length = [rpi_episode_length[i]+tpi_episode_length[-1] for i in range(len(rpi_episode_length))]
+    else:
+        print("You are not considering the data collected from the interaction with the robot")
+
+
+    df.to_csv(file_output, index=False)
+    episodes_length = tpi_episode_length+rpi_episode_length
+
+    return df, (episodes_length)
 
 
-def compute_agent_policy(folder_pathname, user_id, with_feedback, state_space, action_space, episode_length):
+def compute_agent_policy(training_set_filename, state_space, action_space, episode_length):
     #read columns of interest (game_state, attempt, user_prev_action)
     ep = Episode()
-    df = pd.read_csv(folder_pathname+"/"+str(user_id)+"/"+str(with_feedback)+"/summary_bn_variables.csv")
+    df = pd.read_csv(training_set_filename)
     agent_policy_counter = [[0 for a in action_space] for s in state_space]
     agent_policy_prob = [[0 for a in action_space] for s in state_space]
     row_t_0 = 0
     for index, row in df.iterrows():
         if index == 0 or index in episode_length:
-            state_point = (row['game_state'], row['attempt'], 0)
+            state_point = (row['game_state'], row['attempt']+1, 0)
             state_index = ep.state_from_point_to_index(state_space, state_point)
             action_point = (row['agent_assistance'])
             action_index = action_point
             agent_policy_counter[state_index][action_index] += 1
             row_t_0 = row['user_action']
         else:
-            state_point = (row['game_state'], row['attempt'], row_t_0)
+            state_point = (row['game_state'], row['attempt']+1, row_t_0)
             state_index = ep.state_from_point_to_index(state_space, state_point)
             action_point = (row['agent_assistance'])
             action_index = action_point
@@ -214,19 +245,51 @@ def compute_agent_policy(folder_pathname, user_id, with_feedback, state_space, a
 def main():
     #################GENERATE SIMULATION################################
-    # SIMULATION PARAMS
-    epochs = 10
-    scaling_factor = 1
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--bn_user_model_filename', '--bn_user_model', type=str,help="file path of the user bn model",
+                        default="/home/pal/Documents/Framework/bn_generative_model/bn_persona_model/persona_model.bif")
+    parser.add_argument('--bn_agent_model_filename', '--bn_agent_model', type=str,help="file path of the agent bn model",
+                        default="/home/pal/Documents/Framework/bn_generative_model/bn_agent_model/agent_assistive_model.bif")
+    parser.add_argument('--epoch', '--epoch', type=int,help="number of epochs in the simulation", default=200)
+    parser.add_argument('--run', '--run', type=int, help="number of runs in the simulation", default=50)
+    parser.add_argument('--output_policy_filename', '--p', type=str,help="output policy from the simulation",
+                        default="policy.pkl")
+    parser.add_argument('--output_reward_filename', '--r', type=str, help="output reward from the simulation",
+                        default="reward.pkl")
+    parser.add_argument('--output_value_filename', '--v', type=str, help="output value function from the simulation",
+                        default="value_function.pkl")
+    parser.add_argument('--therapist_patient_interaction_folder', '--tpi_path', type=str,help="therapist-patient interaction folder",
+                        default="/home/pal/carf_ws/src/carf/caregiver_in_the_loop/log")
+    parser.add_argument('--agent_patient_interaction_folder', '--api_path', type=str,help="agent-patient interaction folder",
+                        default="/home/pal/carf_ws/src/carf/robot_in_the_loop/log")
+    parser.add_argument('--user_id', '--id', type=int,help="user id")
+    parser.add_argument('--with_feedback', '--f', type=bool,help="offering sociable")
+    parser.add_argument('--session', '--s', type=int, help="session of the agent-human interaction")
+
+    args = parser.parse_args()
+
+
+    # READ PARAMS FROM COMMAND LINE
+    user_id = args.user_id
+    with_feedback = args.with_feedback
+    session = args.session
+    epochs = args.epoch
+    runs = args.run
 
     # initialise the agent
-    bn_model_user_action_filename = '/home/pal/Documents/Framework/bn_generative_model/bn_persona_model/persona_model_test.bif'
-    bn_model_agent_behaviour_filename = '/home/pal/Documents/Framework/bn_generative_model/bn_agent_model/agent_assistive_model.bif'
-    learned_policy_filename = ""
-    bn_model_user_action = bnlearn.import_DAG(bn_model_user_action_filename)
-    bn_model_agent_behaviour = bnlearn.import_DAG(bn_model_agent_behaviour_filename)
+    bn_user_model_filename = args.bn_user_model_filename
+    bn_agent_model_filename = args.bn_agent_model_filename
+    learned_policy_filename = args.output_policy_filename
+    learned_reward_filename = args.output_reward_filename
+    learned_value_f_filename = args.output_value_filename
+    therapist_patient_interaction_folder = args.therapist_patient_interaction_folder
+    agent_patient_interaction_folder = args.agent_patient_interaction_folder
+    scaling_factor = 1
+
+    #import user and agent model
+    bn_user_model_action = bnlearn.import_DAG(bn_user_model_filename)
+    bn_agent_model_behaviour = bnlearn.import_DAG(bn_agent_model_filename)
 
     #setup by the caregiver
-    user_pref_assistance = 2
-    agent_behaviour = "challenge"
 
     # define state space struct for the irl algorithm
     episode_instance = Episode()
@@ -240,127 +303,131 @@ def main():
     states_space_list = list(itertools.product(*state_space))
     state_space_index = [episode_instance.state_from_point_to_index(states_space_list, s) for s in states_space_list]
     agent_assistance_action = [i for i in range(Agent_Assistance.counter.value)]
-    agent_feedback_action = [i for i in range(Agent_Feedback.counter.value)]
     action_space = (agent_assistance_action)
-    action_space_list = action_space#list(itertools.product(*action_space))
-    action_space_index = action_space_list#[episode_instance.state_from_point_to_index(action_space_list, a) for a in action_space_list]
+    action_space_list = action_space
     terminal_state = [(Game_State.counter.value, i, user_action[j]) for i in range(1, Attempt.counter.value + 1) for j in range(len(user_action))]
     initial_state = (1, 1, 0)
-    agent_policy = [0 for s in state_space]
+    #output folders
+    output_folder_data_path = os.getcwd() + "/results/" + str(user_id) +"/"+str(with_feedback)+"/"+str(session)
+    if not os.path.exists(os.getcwd() + "/results"+"/"+str(user_id)):
+        os.mkdir(os.getcwd() + "/results"+"/"+str(user_id))
+    if not os.path.exists(os.getcwd() + "/results"+"/"+str(user_id) +"/"+str(with_feedback)):
+        os.mkdir(os.getcwd() + "/results" + "/" +str(user_id) +"/"+str(with_feedback))
+    if not os.path.exists(output_folder_data_path):
+        os.mkdir(output_folder_data_path)
 
-    #####################INPUT AND OUTPUT FOLDER ####################################
-    input_folder_data = "/home/pal/Documents/Framework/GenerativeMutualShapingRL/data"
-    user_id = 1
-    with_feedback = True
-    output_folder_data = os.getcwd() + "/results/" + str(user_id)
-    if not os.path.exists(output_folder_data):
-        os.mkdir(output_folder_data)
-    if not os.path.exists(output_folder_data+"/"+str(with_feedback)):
-        os.mkdir(output_folder_data+"/"+with_feedback)
+#1. CREATE INITIAL USER COGNITIVE MODEL FROM DATA
+    df_from_data, episode_length = merge_user_log(tpi_folder_pathname=therapist_patient_interaction_folder,
+                                                  file_output=output_folder_data_path+"/summary_bn_variables_from_data.csv",
+                                                  user_id=user_id,
+                                                  with_feedback=with_feedback,
+                                                  rpi_folder_pathname=agent_patient_interaction_folder,
+                                                  column_to_remove=None)
 
-    #1. CREATE INITIAL USER COGNITIVE MODEL FROM DATA
-    df_from_data, episode_length = merge_user_log(folder_pathname=input_folder_data,
-                                                  user_id=user_id, with_feedback=with_feedback, column_to_remove=None)
 
     #2. CREATE POLICY FROM DATA
-    agent_policy_from_data = compute_agent_policy(folder_pathname=input_folder_data,
-                                                  user_id=user_id, with_feedback=with_feedback, state_space=states_space_list,
+    agent_policy_from_data = compute_agent_policy(training_set_filename=output_folder_data_path+"/summary_bn_variables_from_data.csv",
+                                                  state_space=states_space_list,
                                                   action_space=action_space_list, episode_length=episode_length)
+    det_agent_policy_from_data = list(map(lambda x:np.argmax(x), agent_policy_from_data))
+
+    sns.heatmap(np.reshape(det_agent_policy_from_data, (4, 12)), cmap="Spectral", annot=True, cbar=False)
+    plt.savefig(output_folder_data_path + "/pi_from_data"+str(user_id)+"_"+str(with_feedback)+"_"+str(session)+".jpg")
+
     # 3. RUN THE SIMULATION
-    log_directory = input_folder_data+"/"+str(user_id)+"/"+str(with_feedback)
     bn_model_user_action_from_data_and_therapist = None
     bn_model_agent_behaviour_from_data_and_therapist = None
-    if os.path.exists(log_directory):
-        bn_model_user_action_from_data_and_therapist = Sim.build_model_from_data(csv_filename=log_directory+"/summary_bn_variables.csv", dag_filename=bn_model_user_action_filename, dag_model=bn_model_user_action)
-        bn_model_agent_behaviour_from_data_and_therapist = Sim.build_model_from_data(csv_filename=log_directory+"/summary_bn_variables.csv", dag_filename=bn_model_agent_behaviour_filename, dag_model=bn_model_agent_behaviour)
-    else:
-        assert ("You're not using the user information")
-        question = input("Are you sure you don't want to load user's belief information?")
-    diff = dict([(s, [0] * len(action_space_index)) for s in state_space_index])
-    entropy = dict([(s, 0) for s in state_space_index])
-    N = 5
-
-    for i in range(N):
-        game_performance_per_episode, react_time_per_episode, agent_assistance_per_episode, agent_feedback_per_episode, episodes_list = \
-            Sim.simulation(bn_model_user_action=bn_model_user_action_from_data_and_therapist,
-                           bn_model_agent_behaviour = bn_model_agent_behaviour_from_data_and_therapist,
-                           var_user_action_target_action=['user_action'],
-                           var_agent_behaviour_target_action=['agent_assistance'],
-                           game_state_bn_name="game_state",
-                           attempt_bn_name="attempt",
-                           agent_assistance_bn_name="agent_assistance",
-                           agent_feedback_bn_name="agent_feedback",
-                           user_pref_assistance=user_pref_assistance,
-                           agent_behaviour=agent_behaviour,
-                           agent_policy = agent_policy_from_data,
-                           state_space=states_space_list,
-                           action_space=action_space_list,
-                           epochs=epochs, task_complexity=5, max_attempt_per_object=4, alpha_learning=0.1)
-
-        plot_game_performance_path = output_folder_data+"/REAL_SIM_game_performance_" + "epoch_" + str(epochs) + ".jpg"
-        plot_agent_assistance_path = output_folder_data+"/REAL_SIM_agent_assistance_" + "epoch_" + str(epochs) + ".jpg"
-        plot_agent_feedback_path = output_folder_data+"/REAL_SIM_agent_feedback_" + "epoch_" + str(epochs) + ".jpg"
-
-        utils.plot2D_game_performance(plot_game_performance_path, epochs, scaling_factor, game_performance_per_episode)
-        utils.plot2D_assistance(plot_agent_assistance_path, epochs, scaling_factor, agent_assistance_per_episode)
-        utils.plot2D_feedback(plot_agent_feedback_path, epochs, scaling_factor, agent_feedback_per_episode)
-
-        cognitive_game_world, reward, terminals = setup_mdp(initial_state=initial_state, terminal_state=terminal_state,
-                                                            task_length=Game_State.counter.value, n_max_attempt=Attempt.counter.value,
-                                                            action_space=action_space_list, state_space=states_space_list,
-                                                            user_action=user_action, timeout=15, episode=episodes_list)
-
-        state_tuple_indexed = [states_space_list.index(tuple(s)) for s in (states_space_list)]
-        states_space_list_string = [[str(states_space_list[j*12+i]) for i in range(12)] for j in range(4)]
-        build_2dtable(states_space_list_string, 4, 12)
-
-        # R(s) and pi(s) generated from the first sim
-        maxent_R_real_sim = maxent(world=cognitive_game_world, terminal=terminals, trajectories=episodes_list)
-        maxent_V_real_sim, maxent_P_real_sim = vi.value_iteration(cognitive_game_world.p_transition, maxent_R_real_sim, gamma=0.9, error=1e-3,
-                                                                  deterministic=True)
-
-        learned_policy_filename = output_folder_data + "/" + "learned_policy.pkl"
-        with open(learned_policy_filename, 'wb') as f:
-            pickle.dump(maxent_P_real_sim, f, protocol=2)
-
-
-        for s in state_space_index:
-            index = maxent_P_real_sim[s]
-            diff[s][index] += 1.0 / N
-        sns.heatmap(np.reshape(maxent_P_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
-        plt.savefig(output_folder_data + "maxent_P_iter_"+str(i)+".jpg")
-
-    for s in state_space_index:
-        E = 0
-        for i in range(len(action_space_index)):
-            if diff[s][i] > 0:
-                E -= diff[s][i] * math.log(diff[s][i])
-        entropy[s] = E
-
-    # highlight high and low entropy states
-    entropy_sort = sorted(entropy.items(), key=operator.itemgetter(1))
-    s_preferences = [episode_instance.state_from_index_to_point(states_space_list, entropy_sort[i][0]) for i in range(-1, -6, -1)]
-    s_constraints = [episode_instance.state_from_index_to_point(states_space_list, entropy_sort[i][0]) for i in range(27)][22:]
-    print("S_preferences: ", s_preferences)
-    print("S_constrains: ", s_constraints)
+    if os.path.exists(output_folder_data_path):
+        bn_model_user_action_from_data_and_therapist = Sim.build_model_from_data(
+            csv_filename=output_folder_data_path + "/summary_bn_variables_from_data.csv", dag_filename=bn_user_model_filename,
+            dag_model=bn_user_model_action)
+        bn_model_agent_behaviour_from_data_and_therapist = Sim.build_model_from_data(
+            csv_filename=output_folder_data_path + "/summary_bn_variables_from_data.csv", dag_filename=bn_agent_model_filename,
+            dag_model=bn_agent_model_behaviour)
+    else:
+        assert ("You're not using the user information")
+        question = input("Are you sure you don't want to load user's belief information?")
 
-    plt.figure(figsize=(12, 4), num="maxent_rew")
-    sns.heatmap(np.reshape(maxent_R_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
-    plt.savefig(output_folder_data + "real_sim_maxent_R.jpg")
-    plt.figure(figsize=(12, 4), num="maxent_V")
-    sns.heatmap(np.reshape(maxent_V_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
-    plt.savefig(output_folder_data + "real_sim_maxent_V.jpg")
-    plt.figure(figsize=(12, 4), num="maxent_P")
-    sns.heatmap(np.reshape(maxent_P_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
-    plt.savefig(output_folder_data + "real_sim_maxent_P.jpg")
-
+    game_performance_per_episode, agent_assistance_per_episode, episodes = \
+        Sim.simulation(bn_model_user_action=bn_model_user_action_from_data_and_therapist,
+                       bn_model_agent_behaviour = bn_model_agent_behaviour_from_data_and_therapist,
+                       var_user_action_target_action=['user_action'],
+                       var_agent_behaviour_target_action=['agent_assistance'],
+                       game_state_bn_name="game_state",
+                       attempt_bn_name="attempt",
+                       agent_assistance_bn_name="agent_assistance",
+                       agent_policy = [],
+                       state_space=states_space_list,
+                       action_space=action_space_list,
+                       epoch=epochs,
+                       run=runs,
+                       task_complexity=5,
+                       max_attempt_per_object=4,
+                       alpha_learning=0.1)
+
+    plot_game_performance_path = output_folder_data_path+"/game_performance_" + "epoch_" + str(epochs) + ".jpg"
+    plot_agent_assistance_path = output_folder_data_path+"/agent_assistance_" + "epoch_" + str(epochs) + ".jpg"
+
+    utils.plot2D_game_performance(plot_game_performance_path, epochs, scaling_factor, game_performance_per_episode)
+    utils.plot2D_assistance(plot_agent_assistance_path, epochs, scaling_factor, agent_assistance_per_episode)
+
+    # add episodes from different policies
+    # for e in range(len(episodes)):
+    #     episodes_from_different_policies.append(Episode(episodes[e]._t))
+
+    cognitive_game_world, reward, terminals = setup_mdp(initial_state=initial_state, terminal_state=terminal_state,
+                                                        task_length=Game_State.counter.value, n_max_attempt=Attempt.counter.value,
+                                                        action_space=action_space_list, state_space=states_space_list,
+                                                        user_action=user_action, timeout=15, episode=episodes)
+
+    # state_tuple_indexed = [states_space_list.index(tuple(s)) for s in (states_space_list)]
+    # states_space_list_string = [[str(states_space_list[j*12+i]) for i in range(12)] for j in range(4)]
+    # build_2dtable(states_space_list_string, 4, 12)
+
+
+
+    # R(s) and pi(s) generated from the first sim
+    maxent_R = maxent(world=cognitive_game_world, terminal=terminals, trajectories=episodes)
+    maxent_V, maxent_P = vi.value_iteration(cognitive_game_world.p_transition, maxent_R, gamma=0.99, error=1e-2,
+                                            deterministic=False)
+    print(maxent_P)
+    with open(output_folder_data_path+"/"+learned_policy_filename, 'wb') as f:
+        pickle.dump(maxent_P, f, protocol=2)
+    with open(output_folder_data_path+"/"+learned_reward_filename, 'wb') as f:
+        pickle.dump(maxent_R, f, protocol=2)
+    with open(output_folder_data_path+"/"+learned_value_f_filename, 'wb') as f:
+        pickle.dump(maxent_V, f, protocol=2)
+
+    # if n>0:
+    #
+    #     s_constraints, s_preferences = get_entropy([policies_from_sim[-1], det_agent_policy_from_data], state_space_index, action_space_index)
+    #     print("S_preferences: ", s_preferences)
+    #     print("S_constrains: ", s_constraints)
+    #     for state_index in s_constraints:
+    #         action = np.argmax(agent_policy_from_data[state_index])
+    #         for action_index in range(len(maxent_P_real_sim[state_index])):
+    #             if action == action_index:
+    #                 maxent_P_real_sim[state_index][action_index] = (0.9)
+    #             else:
+    #                 maxent_P_real_sim[state_index][action_index] = 0.02
+    #         maxent_P_real_sim[state_index] = list(map(lambda x:x/sum(maxent_P_real_sim[state_index]), maxent_P_real_sim[state_index]))
+
+    sns.heatmap(np.reshape(maxent_R, (4, 12)), cmap="Spectral", annot=True, cbar=False)
+    plt.savefig(output_folder_data_path + "/maxent_R.jpg")
+    plt.show()
+    sns.heatmap(np.reshape(maxent_V, (4, 12)), cmap="Spectral", annot=True, cbar=False)
+    plt.savefig(output_folder_data_path + "/maxent_V.jpg")
+    plt.show()
+    maxent_P_det = list(map(lambda x: np.argmax(x), maxent_P))
+    sns.heatmap(np.reshape(maxent_P_det, (4, 12)), cmap="Spectral", annot=True, cbar=False)
+    plt.savefig(output_folder_data_path + "/maxent_P.jpg")
+    plt.show()