from recsim.agent import AbstractEpisodicRecommenderAgent
import numpy as np


class GreedyAgent(AbstractEpisodicRecommenderAgent):
    """Greedily recommends the single item whose review yields the largest
    expected gain in total recall probability across all items."""

    def __init__(self, sess, observation_space, action_space, eval_mode,
                 summary_writer):
        super(GreedyAgent, self).__init__(action_space, summary_writer)
        self._num_candidates = int(action_space.nvec[0])
        # Default per-item memory-model weights (3 features per item);
        # overwritten in begin_episode if the environment exposes the true W.
        self._W = np.array([[3, 1.5, 0.5]] * self._num_candidates)
        assert self._slate_size == 1

    def begin_episode(self, observation=None):
        user = observation['user']
        docs = observation['doc']
        if 'W' in user:
            # The memory-model weights are observable on the user.
            self._W = user['W']
        else:
            # Otherwise read the per-item weights from the document observations.
            w = [docs[doc_id] for doc_id in docs]
            self._W = np.array(w).reshape((-1, 3))
        print("agent W:", self._W)
        self._episode_num += 1
        return self.step(0, observation)

    def step(self, reward, observation):
        docs = observation['doc']
        user = observation['user']
        # Recall probabilities if no item is reviewed at this step.
        base_pr = self.calc_prs(user['time'], user['last_review'],
                                user['history'], self._W)

        max_pr = -self._num_candidates  # lower bound on the summed marginal gain
        max_id = 0
        for did in docs:
            doc_id = int(did)
            # Simulate reviewing this item now.
            last_review = user['last_review'].copy()
            history = user['history'].copy()
            last_review[doc_id] = user['time']
            time = user['time'] + 1
            history[doc_id][0] += 1  # one more review of this item
            history[doc_id][1] += 1  # first outcome (e.g. successful recall)
            pr1 = self.calc_prs(time, last_review, history, self._W)
            history[doc_id][1] -= 1
            history[doc_id][2] += 1  # second outcome (e.g. failed recall)
            pr2 = self.calc_prs(time, last_review, history, self._W)
            # Expected marginal gain, averaging the two review outcomes.
            pr = (pr1 + pr2) / 2 - base_pr
            sum_pr = np.sum(pr)
            if sum_pr > max_pr:
                max_pr = sum_pr
                max_id = doc_id
        print(f"choose doc{max_id} with marginal gain {max_pr}")
        return [max_id]

    def calc_prs(self, train_time, last_review, history, W):
        """Per-item recall probability: exp(-elapsed / exp(history . w))."""
        elapsed = train_time - last_review
        # Per-item memory strength from the row-wise dot product of history and W.
        mem_param = np.exp(np.einsum('ij,ij->i', history, W))
        return np.exp(-elapsed / mem_param)
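

# Usage sketch (assumption, not part of the original agent): the constructor
# signature above matches the create_agent_fn convention that recsim's
# runner_lib passes to its runners, so a small factory like the one below can
# wire this agent into a runner. The runner call in the comment is only an
# illustration; `env` and the base_dir path are placeholders.
def create_agent(sess, environment, eval_mode, summary_writer=None):
    """Factory following the create_agent_fn convention (sketch)."""
    return GreedyAgent(sess, environment.observation_space,
                       environment.action_space, eval_mode, summary_writer)


# from recsim.simulator import runner_lib
#
# runner = runner_lib.EvalRunner(base_dir='/tmp/greedy_agent',
#                                create_agent_fn=create_agent,
#                                env=env,
#                                max_eval_episodes=5)
# runner.run_experiment()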