Commit 027ea369 authored by Antonio Andriella

working version of the entire framework for generating a policy

parent 90c0962f
.idea
user_react_time,game_state,attempt,agent_feedback,user_memory,user_action,agent_assistance,user_reactivity
2,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
1,1,0,0,0,0,1,0
0,1,0,0,0,2,0,0
0,1,1,0,0,0,2,0
0,2,0,0,0,2,0,0
1,2,1,0,0,0,1,0
cum_elapsed_time,to,cum_react_time,from,timeout,user_action,token_id,caregiver_feedback,avg_caregiver_assistance_per_move,attempt
7.621,1,0.0,13,0,,289,0,0.0,1
0.71,2,7.663,8,0,,311,0,0.0,1
1.738,3,5.253,15,0,,321,0,0.0,1
0.931,4,10.58,20,1,,499,0,0.0,2
0.826,5,6.484,12,2,,537,0,0.0,2
with_feedback,objective,session,user_id,timeout
True,ascending_odd,2,1,15
game_state,user_action,attempt,caregiver_feedback,react_time,caregiver_assistance,user_react_time,from,elapsed_time,to,timeout,token_id
0,0,1,0,0.0,0,2,13,7.621,1,0,289
0,0,1,0,7.663,0,1,8,0.71,2,0,311
1,0,1,0,5.253,0,1,15,1.738,3,0,321
1,2,1,0,15,0,0,,0,,0,
1,0,2,0,10.58,0,0,20,0.931,4,1,499
2,2,1,0,15,0,0,,0,,1,
2,0,2,0,6.484,0,1,12,0.826,5,2,537
tot_elapsed_time,avg_lev_assistance,tot_react_time,n_timeout,n_attempt,n_sociable
11.826,0.0,29.980000000000004,3,7,0
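These CSV fragments are the raw interaction logs the script consumes: a per-session bn_variables.csv (caregiver or robot in the loop, with slightly different column sets) plus a per-user summary and a session-parameter file. A minimal loading sketch, assuming pandas and an illustrative path following the user_id/with_feedback/session layout used by merge_user_log below:

# Sketch (assumption: the path is illustrative; each session folder holds a bn_variables.csv
# with the columns shown above).
import pandas as pd

log = pd.read_csv("1/True/1/bn_variables.csv")   # user 1, with_feedback=True, session 1
print(log[["game_state", "attempt", "user_action", "agent_assistance"]].head())
print(log.groupby("game_state").size())          # rows (moves) recorded per game state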
@@ -22,9 +22,9 @@ import itertools
import os
import math
import operator
import datetime
import pickle
import bnlearn
import argparse
from cognitive_game_env import CognitiveGame
from episode import Episode
@@ -108,10 +108,10 @@ def get_entropy(policies, state_space, action_space):
# highlight high and low entropy states
entropy_sort = sorted(entropy.items(), key=operator.itemgetter(1))
s_preferences = [entropy_sort[i][0] for i in range(-1, -6, -1)]
s_constraints = [entropy_sort[i][0] for i in range(5)]
s_constraints = [entropy_sort[i][0] for i in range(-1, -6, -1)]
s_preferences = [entropy_sort[i][0] for i in range(5)]
return s_preferences, s_constraints
return s_constraints, s_preferences
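get_entropy scores each state by how much the candidate policies disagree about the action to take; after this change the highest-entropy states feed s_constraints and the lowest-entropy ones feed s_preferences, the reverse of the previous assignment. A minimal sketch of that per-state disagreement entropy, assuming each policy simply maps a state index to a chosen action:

# Sketch (assumption: each policy is a list indexed by state, holding the chosen action index).
import math
from collections import Counter

def state_entropy(policies, state):
    counts = Counter(pi[state] for pi in policies)
    n = len(policies)
    return -sum((c / n) * math.log(c / n) for c in counts.values())

# Policies that agree on a state give entropy 0; full disagreement between two policies gives log(2).
print(state_entropy([[0, 1], [0, 2]], 0), state_entropy([[0, 1], [0, 2]], 1))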
def maxent(world, terminal, trajectories):
@@ -125,6 +125,7 @@ def maxent(world, terminal, trajectories):
estimation of the reward based on the MEIRL
"""
# set up features: we use one feature vector per state
#features = world.state_features()
features = world.assistive_feature(trajectories)
# choose our parameter initialization strategy:
@@ -136,7 +137,8 @@
optim = O.ExpSga(lr=O.linear_decay(lr0=0.1))
# actually do some inverse reinforcement learning
reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)
reward = M.irl_causal(world.p_transition, features, terminal, trajectories, optim, init, discount=0.1)
#reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)
return reward
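This hunk swaps M.irl for M.irl_causal with a discount of 0.1 while keeping the surrounding pipeline: per-state features from the trajectories, an initializer and exponentiated-gradient optimizer, and a reward estimate that is later run through value iteration. A condensed sketch of that pipeline, assuming the same M/O/vi modules used elsewhere in this file and an O.Constant(1.0) initializer standing in for the elided init line:

# Sketch (assumptions: `world`, `terminals` and `episodes` are built as in main() below;
# O.Constant(1.0) is an assumed stand-in for the init line not shown in this hunk).
features = world.assistive_feature(episodes)          # one feature vector per state
init = O.Constant(1.0)
optim = O.ExpSga(lr=O.linear_decay(lr0=0.1))
reward = M.irl_causal(world.p_transition, features, terminals, episodes, optim, init, discount=0.1)
value, policy = vi.value_iteration(world.p_transition, reward, gamma=0.9, error=1e-3, deterministic=True)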
@@ -157,22 +159,28 @@ def merge_agent_policy(policy_from_data, policy_from_therapist):
merged_policy[index]
merged_policy[index] = list(map(lambda x:sum(x), ))
def merge_user_log(folder_pathname, user_id, with_feedback, column_to_remove):
absolute_path = folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
def merge_user_log(tpi_folder_pathname, file_output, user_id, with_feedback, rpi_folder_pathname=None, column_to_remove=None):
tpi_absolute_path = tpi_folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
if rpi_folder_pathname!=None:
rpi_absolute_path = rpi_folder_pathname+"/"+str(+user_id)+"/"+str(with_feedback)
else:
rpi_absolute_path=None
episodes_length, tpi_episode_length, rpi_episode_length = [], [], []
df = pd.DataFrame()
if len(absolute_path)==0:
print("Error no folders in path ", absolute_path)
# else:
# df = pd.read_csv(absolute_path+"/1/bn_variables.csv")
if len(tpi_absolute_path)==0:
print("Error no folders in path ", tpi_absolute_path)
if column_to_remove!=None:
df = df.drop(column_to_remove, axis=1)
#df_removed = df.drop(["user_memory", "user_reactivity"], axis=1)
sessions_directory = os.listdir(absolute_path)
episode_length = [0]*(len(sessions_directory)+1)
tpi_sessions_directory = os.listdir(tpi_absolute_path)
tpi_episode_length = [0]*(len(tpi_sessions_directory)+1)
for i in range(len(sessions_directory)):
file_folder = absolute_path+"/"+sessions_directory[i]
files = None
for i in range(len(tpi_sessions_directory)):
file_folder = tpi_absolute_path+"/"+tpi_sessions_directory[i]
print("File folder: ", file_folder)
files = os.listdir(file_folder)
@@ -180,27 +188,50 @@ def merge_user_log(folder_pathname, user_id, with_feedback, column_to_remove):
if files[k] == "bn_variables.csv":
df_ = pd.read_csv(file_folder+"/"+files[k])
df = df.append(df_)
episode_length[i+1] = episode_length[i]+(df_.shape[0]-1)+1
df.to_csv(absolute_path + "/summary_bn_variables.csv", index=False)
return df, episode_length
tpi_episode_length[i+1] = tpi_episode_length[i]+(df_.shape[0]-1)+1
if rpi_folder_pathname!=None and len(rpi_absolute_path)!=0:
rpi_sessions_directory = os.listdir(rpi_absolute_path)
rpi_episode_length = [0] * len(rpi_sessions_directory)
files = None
for i in range(len(rpi_sessions_directory)):
file_folder = rpi_absolute_path + "/" + rpi_sessions_directory[i]
print("File folder: ", file_folder)
files = os.listdir(file_folder)
for k in range(len(files)):
if files[k] == "bn_variables.csv":
df_ = pd.read_csv(file_folder + "/" + files[k])
df = df.append(df_)
rpi_episode_length[i] = (sum(rpi_episode_length)+ (df_.shape[0] - 1) + 1)
rpi_episode_length = [rpi_episode_length[i]+tpi_episode_length[-1] for i in range(len(rpi_episode_length))]
else:
print("You are not considering the data collected from the interaction with the robot")
df.to_csv(file_output, index=False)
episodes_length = tpi_episode_length+rpi_episode_length
return df, (episodes_length)
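merge_user_log now writes the concatenated log to file_output and returns both the dataframe and the cumulative episodes_length list, whose consecutive entries delimit one session each (compute_agent_policy below checks membership in this list to detect the first row of an episode). A minimal usage sketch with illustrative paths and only caregiver sessions:

# Sketch (assumptions: paths are illustrative; no robot-in-the-loop folder is passed,
# so episodes_length comes from the caregiver sessions alone).
df, episodes_length = merge_user_log(tpi_folder_pathname="caregiver_in_the_loop/log",
                                     file_output="summary_bn_variables_from_data.csv",
                                     user_id=1, with_feedback=True)
sessions = [df.iloc[start:end] for start, end in zip(episodes_length[:-1], episodes_length[1:])]
print([len(s) for s in sessions])   # rows recorded in each session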
def compute_agent_policy(folder_pathname, user_id, with_feedback, state_space, action_space, episode_length):
def compute_agent_policy(training_set_filename, state_space, action_space, episode_length):
#read columns of interest (game_state, attempt, user_prev_action)
ep = Episode()
df = pd.read_csv(folder_pathname+"/"+str(user_id)+"/"+str(with_feedback)+"/summary_bn_variables.csv")
df = pd.read_csv(training_set_filename)
agent_policy_counter = [[0 for a in action_space] for s in state_space]
agent_policy_prob = [[0 for a in action_space] for s in state_space]
row_t_0 = 0
for index, row in df.iterrows():
if index == 0 or index in episode_length:
state_point = (row['game_state'], row['attempt'], 0)
state_point = (row['game_state'], row['attempt']+1, 0)
state_index = ep.state_from_point_to_index(state_space, state_point)
action_point = (row['agent_assistance'])
action_index = action_point
agent_policy_counter[state_index][action_index] += 1
row_t_0 = row['user_action']
else:
state_point = (row['game_state'], row['attempt'], row_t_0)
state_point = (row['game_state'], row['attempt']+1, row_t_0)
state_index = ep.state_from_point_to_index(state_space, state_point)
action_point = (row['agent_assistance'])
action_index = action_point
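The tail of compute_agent_policy is elided by the diff; it presumably accumulates agent_policy_counter and normalises it into agent_policy_prob. A minimal, hypothetical sketch of that normalisation step (not the committed code):

# Hypothetical helper illustrating the count-to-probability step; names mirror the arrays above.
def normalise_policy(agent_policy_counter):
    agent_policy_prob = []
    for counts in agent_policy_counter:
        total = sum(counts)
        if total == 0:
            agent_policy_prob.append([1.0 / len(counts)] * len(counts))   # unvisited state: uniform fallback
        else:
            agent_policy_prob.append([c / total for c in counts])
    return agent_policy_prob

print(normalise_policy([[2, 1, 0], [0, 0, 0]]))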
@@ -214,19 +245,51 @@ def compute_agent_policy(folder_pathname, user_id, with_feedback, state_space, a
def main():
#################GENERATE SIMULATION################################
# SIMULATION PARAMS
epochs = 10
scaling_factor = 1
parser = argparse.ArgumentParser()
parser.add_argument('--bn_user_model_filename', '--bn_user_model', type=str,help="file path of the user bn model",
default="/home/pal/Documents/Framework/bn_generative_model/bn_persona_model/persona_model.bif")
parser.add_argument('--bn_agent_model_filename', '--bn_agent_model', type=str,help="file path of the agent bn model",
default="/home/pal/Documents/Framework/bn_generative_model/bn_agent_model/agent_assistive_model.bif")
parser.add_argument('--epoch', '--epoch', type=int,help="number of epochs in the simulation", default=200)
parser.add_argument('--run', '--run', type=int, help="number of runs in the simulation", default=50)
parser.add_argument('--output_policy_filename', '--p', type=str,help="output policy from the simulation",
default="policy.pkl")
parser.add_argument('--output_reward_filename', '--r', type=str, help="output reward from the simulation",
default="reward.pkl")
parser.add_argument('--output_value_filename', '--v', type=str, help="output value function from the simulation",
default="value_function.pkl")
parser.add_argument('--therapist_patient_interaction_folder', '--tpi_path', type=str,help="therapist-patient interaction folder",
default="/home/pal/carf_ws/src/carf/caregiver_in_the_loop/log")
parser.add_argument('--agent_patient_interaction_folder', '--api_path', type=str,help="agent-patient interaction folder",
default="/home/pal/carf_ws/src/carf/robot_in_the_loop/log")
parser.add_argument('--user_id', '--id', type=int,help="user id")
parser.add_argument('--with_feedback', '--f', type=bool,help="whether the agent offers sociable feedback")
parser.add_argument('--session', '--s', type=int, help="session of the agent-human interaction")
args = parser.parse_args()
# READ PARAMS FROM COMMAND LINE
user_id = args.user_id
with_feedback = args.with_feedback
session = args.session
epochs = args.epoch
runs = args.run
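With these arguments in place, an illustrative invocation of the script (the script name and flag values are examples, not taken from the commit) would be: python policy_generation.py --id 1 --f True --s 2 --epoch 200 --run 50, with the model and log paths left at their defaults. Note that argparse's type=bool converts any non-empty string to True, so passing --f False still yields with_feedback=True.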
# initialise the agent
bn_model_user_action_filename = '/home/pal/Documents/Framework/bn_generative_model/bn_persona_model/persona_model_test.bif'
bn_model_agent_behaviour_filename = '/home/pal/Documents/Framework/bn_generative_model/bn_agent_model/agent_assistive_model.bif'
learned_policy_filename = ""
bn_model_user_action = bnlearn.import_DAG(bn_model_user_action_filename)
bn_model_agent_behaviour = bnlearn.import_DAG(bn_model_agent_behaviour_filename)
bn_user_model_filename = args.bn_user_model_filename
bn_agent_model_filename = args.bn_agent_model_filename
learned_policy_filename = args.output_policy_filename
learned_reward_filename = args.output_reward_filename
learned_value_f_filename = args.output_value_filename
therapist_patient_interaction_folder = args.therapist_patient_interaction_folder
agent_patient_interaction_folder = args.agent_patient_interaction_folder
scaling_factor = 1
#import user and agent model
bn_user_model_action = bnlearn.import_DAG(bn_user_model_filename)
bn_agent_model_behaviour = bnlearn.import_DAG(bn_agent_model_filename)
#setup by the caregiver
user_pref_assistance = 2
agent_behaviour = "challenge"
# define state space struct for the irl algorithm
episode_instance = Episode()
@@ -240,127 +303,131 @@ def main():
states_space_list = list(itertools.product(*state_space))
state_space_index = [episode_instance.state_from_point_to_index(states_space_list, s) for s in states_space_list]
agent_assistance_action = [i for i in range(Agent_Assistance.counter.value)]
agent_feedback_action = [i for i in range(Agent_Feedback.counter.value)]
action_space = (agent_assistance_action)
action_space_list = action_space#list(itertools.product(*action_space))
action_space_index = action_space_list#[episode_instance.state_from_point_to_index(action_space_list, a) for a in action_space_list]
action_space_list = action_space
terminal_state = [(Game_State.counter.value, i, user_action[j]) for i in range(1, Attempt.counter.value + 1) for j in
range(len(user_action))]
initial_state = (1, 1, 0)
agent_policy = [0 for s in state_space]
#output folders
output_folder_data_path = os.getcwd() + "/results/" + str(user_id) +"/"+str(with_feedback)+"/"+str(session)
if not os.path.exists(os.getcwd() + "/results"+"/"+str(user_id)):
os.mkdir(os.getcwd() + "/results"+"/"+str(user_id))
if not os.path.exists(os.getcwd() + "/results"+"/"+str(user_id) +"/"+str(with_feedback)):
os.mkdir(os.getcwd() + "/results" + "/" +str(user_id) +"/"+str(with_feedback))
if not os.path.exists(output_folder_data_path):
os.mkdir(output_folder_data_path)
#####################INPUT AND OUTPUT FOLDER ####################################
input_folder_data = "/home/pal/Documents/Framework/GenerativeMutualShapingRL/data"
user_id = 1
with_feedback = True
output_folder_data = os.getcwd() + "/results/" + str(user_id)
if not os.path.exists(output_folder_data):
os.mkdir(output_folder_data)
if not os.path.exists(output_folder_data+"/"+str(with_feedback)):
os.mkdir(output_folder_data+"/"+with_feedback)
#1. CREATE INITIAL USER COGNITIVE MODEL FROM DATA
df_from_data, episode_length = merge_user_log(tpi_folder_pathname=therapist_patient_interaction_folder,
file_output=output_folder_data_path+"/summary_bn_variables_from_data.csv",
user_id=user_id,
with_feedback=with_feedback,
rpi_folder_pathname=agent_patient_interaction_folder,
column_to_remove=None)
#1. CREATE INITIAL USER COGNITIVE MODEL FROM DATA
df_from_data, episode_length = merge_user_log(folder_pathname=input_folder_data,
user_id=user_id, with_feedback=with_feedback, column_to_remove=None)
#2. CREATE POLICY FROM DATA
agent_policy_from_data = compute_agent_policy(folder_pathname=input_folder_data,
user_id=user_id, with_feedback=with_feedback, state_space=states_space_list,
agent_policy_from_data = compute_agent_policy(training_set_filename=output_folder_data_path+"/summary_bn_variables_from_data.csv",
state_space=states_space_list,
action_space=action_space_list, episode_length=episode_length)
det_agent_policy_from_data = list(map(lambda x:np.argmax(x), agent_policy_from_data))
sns.heatmap(np.reshape(det_agent_policy_from_data, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data_path + "/pi_from_data"+str(user_id)+"_"+str(with_feedback)+"_"+str(session)+".jpg")
# 3. RUN THE SIMULATION
log_directory = input_folder_data+"/"+str(user_id)+"/"+str(with_feedback)
bn_model_user_action_from_data_and_therapist = None
bn_model_agent_behaviour_from_data_and_therapist = None
if os.path.exists(log_directory):
bn_model_user_action_from_data_and_therapist = Sim.build_model_from_data(csv_filename=log_directory+"/summary_bn_variables.csv", dag_filename=bn_model_user_action_filename, dag_model=bn_model_user_action)
bn_model_agent_behaviour_from_data_and_therapist = Sim.build_model_from_data(csv_filename=log_directory+"/summary_bn_variables.csv", dag_filename=bn_model_agent_behaviour_filename, dag_model=bn_model_agent_behaviour)
else:
assert ("You're not using the user information")
question = input("Are you sure you don't want to load user's belief information?")
diff = dict([(s, [0] * len(action_space_index)) for s in state_space_index])
entropy = dict([(s, 0) for s in state_space_index])
N = 5
for i in range(N):
game_performance_per_episode, react_time_per_episode, agent_assistance_per_episode, agent_feedback_per_episode, episodes_list = \
Sim.simulation(bn_model_user_action=bn_model_user_action_from_data_and_therapist,
bn_model_agent_behaviour = bn_model_agent_behaviour_from_data_and_therapist,
var_user_action_target_action=['user_action'],
var_agent_behaviour_target_action=['agent_assistance'],
game_state_bn_name="game_state",
attempt_bn_name="attempt",
agent_assistance_bn_name="agent_assistance",
agent_feedback_bn_name="agent_feedback",
user_pref_assistance=user_pref_assistance,
agent_behaviour=agent_behaviour,
agent_policy = agent_policy_from_data,
state_space=states_space_list,
action_space=action_space_list,
epochs=epochs, task_complexity=5, max_attempt_per_object=4, alpha_learning=0.1)
plot_game_performance_path = output_folder_data+"/REAL_SIM_game_performance_" + "epoch_" + str(epochs) + ".jpg"
plot_agent_assistance_path = output_folder_data+"/REAL_SIM_agent_assistance_" + "epoch_" + str(epochs) + ".jpg"
plot_agent_feedback_path = output_folder_data+"/REAL_SIM_agent_feedback_" + "epoch_" + str(epochs) + ".jpg"
utils.plot2D_game_performance(plot_game_performance_path, epochs, scaling_factor, game_performance_per_episode)
utils.plot2D_assistance(plot_agent_assistance_path, epochs, scaling_factor, agent_assistance_per_episode)
utils.plot2D_feedback(plot_agent_feedback_path, epochs, scaling_factor, agent_feedback_per_episode)
cognitive_game_world, reward, terminals = setup_mdp(initial_state=initial_state, terminal_state=terminal_state,
task_length=Game_State.counter.value, n_max_attempt=Attempt.counter.value,
action_space=action_space_list, state_space=states_space_list,
user_action=user_action, timeout=15, episode=episodes_list)
state_tuple_indexed = [states_space_list.index(tuple(s)) for s in (states_space_list)]
states_space_list_string = [[str(states_space_list[j*12+i]) for i in range(12)] for j in range(4)]
build_2dtable(states_space_list_string, 4, 12)
# R(s) and pi(s) generated from the first sim
maxent_R_real_sim = maxent(world=cognitive_game_world, terminal=terminals, trajectories=episodes_list)
maxent_V_real_sim, maxent_P_real_sim = vi.value_iteration(cognitive_game_world.p_transition, maxent_R_real_sim, gamma=0.9, error=1e-3,
deterministic=True)
learned_policy_filename = output_folder_data + "/" + "learned_policy.pkl"
with open(learned_policy_filename, 'wb') as f:
pickle.dump(maxent_P_real_sim, f, protocol=2)
for s in state_space_index:
index = maxent_P_real_sim[s]
diff[s][index] += 1.0 / N
sns.heatmap(np.reshape(maxent_P_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data + "maxent_P_iter_"+str(i)+".jpg")
for s in state_space_index:
E = 0
for i in range(len(action_space_index)):
if diff[s][i] > 0:
E -= diff[s][i] * math.log(diff[s][i])
entropy[s] = E
# highlight high and low entropy states
entropy_sort = sorted(entropy.items(), key=operator.itemgetter(1))
s_preferences = [episode_instance.state_from_index_to_point(states_space_list, entropy_sort[i][0]) for i in range(-1, -6, -1)]
s_constraints = [episode_instance.state_from_index_to_point(states_space_list, entropy_sort[i][0]) for i in range(27)][22:]
print("S_preferences: ", s_preferences)
print("S_constrains: ", s_constraints)
if os.path.exists(output_folder_data_path):
bn_model_user_action_from_data_and_therapist = Sim.build_model_from_data(
csv_filename=output_folder_data_path + "/summary_bn_variables_from_data.csv", dag_filename=bn_user_model_filename,
dag_model=bn_user_model_action)
bn_model_agent_behaviour_from_data_and_therapist = Sim.build_model_from_data(
csv_filename=output_folder_data_path + "/summary_bn_variables_from_data.csv", dag_filename=bn_agent_model_filename,
dag_model=bn_agent_model_behaviour)
else:
assert ("You're not using the user information")
question = input("Are you sure you don't want to load user's belief information?")
plt.figure(figsize=(12, 4), num="maxent_rew")
sns.heatmap(np.reshape(maxent_R_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data + "real_sim_maxent_R.jpg")
plt.figure(figsize=(12, 4), num="maxent_V")
sns.heatmap(np.reshape(maxent_V_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data + "real_sim_maxent_V.jpg")
plt.figure(figsize=(12, 4), num="maxent_P")
sns.heatmap(np.reshape(maxent_P_real_sim, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data + "real_sim_maxent_P.jpg")
game_performance_per_episode, agent_assistance_per_episode, episodes = \
Sim.simulation(bn_model_user_action=bn_model_user_action_from_data_and_therapist,
bn_model_agent_behaviour = bn_model_agent_behaviour_from_data_and_therapist,
var_user_action_target_action=['user_action'],
var_agent_behaviour_target_action=['agent_assistance'],
game_state_bn_name="game_state",
attempt_bn_name="attempt",
agent_assistance_bn_name="agent_assistance",
agent_policy = [],
state_space=states_space_list,
action_space=action_space_list,
epoch=epochs,
run=runs,
task_complexity=5,
max_attempt_per_object=4,
alpha_learning=0.1)
plot_game_performance_path = output_folder_data_path+"/game_performance_" + "epoch_" + str(epochs) + ".jpg"
plot_agent_assistance_path = output_folder_data_path+"/agent_assistance_" + "epoch_" + str(epochs) + ".jpg"
utils.plot2D_game_performance(plot_game_performance_path, epochs, scaling_factor, game_performance_per_episode)
utils.plot2D_assistance(plot_agent_assistance_path, epochs, scaling_factor, agent_assistance_per_episode)
# add episodes from different policies
# for e in range(len(episodes)):
# episodes_from_different_policies.append(Episode(episodes[e]._t))
cognitive_game_world, reward, terminals = setup_mdp(initial_state=initial_state, terminal_state=terminal_state,
task_length=Game_State.counter.value, n_max_attempt=Attempt.counter.value,
action_space=action_space_list, state_space=states_space_list,
user_action=user_action, timeout=15, episode=episodes)
# state_tuple_indexed = [states_space_list.index(tuple(s)) for s in (states_space_list)]
# states_space_list_string = [[str(states_space_list[j*12+i]) for i in range(12)] for j in range(4)]
# build_2dtable(states_space_list_string, 4, 12)
# R(s) and pi(s) generated from the first sim
maxent_R = maxent(world=cognitive_game_world, terminal=terminals, trajectories=episodes)
maxent_V, maxent_P = vi.value_iteration(cognitive_game_world.p_transition, maxent_R, gamma=0.99, error=1e-2,
deterministic=False)
print(maxent_P)
with open(output_folder_data_path+"/"+learned_policy_filename, 'wb') as f:
pickle.dump(maxent_P, f, protocol=2)
with open(output_folder_data_path+"/"+learned_reward_filename, 'wb') as f:
pickle.dump(maxent_R, f, protocol=2)
with open(output_folder_data_path+"/"+learned_value_f_filename, 'wb') as f:
pickle.dump(maxent_V, f, protocol=2)
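The learned policy, reward and value function are pickled with protocol 2, which remains readable from Python 2. A minimal sketch of loading the policy back and querying it, assuming the default policy.pkl name and the stochastic per-state policy saved above:

# Sketch (assumptions: "policy.pkl" is the default --output_policy_filename written inside the
# results folder; maxent_P was computed with deterministic=False, i.e. one action distribution per state).
import pickle
import numpy as np

with open("policy.pkl", "rb") as f:
    policy = pickle.load(f)

state_index = 0                                  # e.g. the (1, 1, 0) initial state, once indexed
action = int(np.argmax(policy[state_index]))     # greedy assistance level for that state
print("suggested assistance level:", action)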
# if n>0:
#
# s_constraints, s_preferences = get_entropy([policies_from_sim[-1], det_agent_policy_from_data], state_space_index, action_space_index)
# print("S_preferences: ", s_preferences)
# print("S_constrains: ", s_constraints)
# for state_index in s_constraints:
# action = np.argmax(agent_policy_from_data[state_index])
# for action_index in range(len(maxent_P_real_sim[state_index])):
# if action == action_index:
# maxent_P_real_sim[state_index][action_index] = (0.9)
# else:
# maxent_P_real_sim[state_index][action_index] = 0.02
# maxent_P_real_sim[state_index] = list(map(lambda x:x/sum(maxent_P_real_sim[state_index]), maxent_P_real_sim[state_index]))
sns.heatmap(np.reshape(maxent_R, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data_path + "/maxent_R.jpg")
plt.show()
sns.heatmap(np.reshape(maxent_V, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data_path + "/maxent_V.jpg")
plt.show()
maxent_P_det = list(map(lambda x: np.argmax(x), maxent_P))
sns.heatmap(np.reshape(maxent_P_det, (4, 12)), cmap="Spectral", annot=True, cbar=False)
plt.savefig(output_folder_data_path + "/maxent_P.jpg")
plt.show()