from recsim.agent import AbstractEpisodicRecommenderAgent
import numpy as np


class GreedyAgent(AbstractEpisodicRecommenderAgent):
    """Greedily recommends the document whose review gives the largest
    expected marginal gain in total recall probability."""

    def __init__(self, sess, observation_space, action_space, eval_mode,
                 summary_writer):
        super(GreedyAgent, self).__init__(action_space, summary_writer)
        self._num_candidates = int(action_space.nvec[0])
        # Default memory weights, one 3-dim row per candidate document;
        # replaced in begin_episode once an observation is available.
        self._W = np.array([[3, 1.5, 0.5]] * self._num_candidates)
        assert self._slate_size == 1

    def begin_episode(self, observation=None):
        user = observation['user']
        docs = observation['doc']
        if 'W' in user:  # the user's memory weights are directly observable
            self._W = user['W']
        else:  # otherwise fall back to the per-document features as a proxy for W
            w = []
            for doc_id in docs:
                w.append(docs[doc_id])
            self._W = np.array(w).reshape((-1, 3))
        print("agent W:", self._W)
        self._episode_num += 1
        return self.step(0, observation)

    def step(self, reward, observation):
        docs = observation['doc']
        user = observation['user']
        # Recall probability of every document if nothing is reviewed now.
        base_pr = self.calc_prs(user['time'], user['last_review'],
                                user['history'], self._W)
        # np.exp(-last_review / np.exp(np.dot(W, x))).squeeze()
        # Lower bound on any candidate's summed marginal gain, since each
        # per-document gain lies in (-1, 1).
        max_pr = -self._num_candidates
        max_id = 0
        for did in docs:
            doc_id = int(did)
            last_review = user['last_review'].copy()
            history = user['history'].copy()
            last_review[doc_id] = user['time']
            time = user['time'] + 1
            # Simulate reviewing this document: column 0 counts reviews,
            # while columns 1 and 2 appear to count the two possible review
            # outcomes (inferred from the update pattern below).
            history[doc_id][0] += 1
            history[doc_id][1] += 1
            pr1 = self.calc_prs(time, last_review, history, self._W)
            history[doc_id][1] -= 1
            history[doc_id][2] += 1
            pr2 = self.calc_prs(time, last_review, history, self._W)
            # Unweighted average over the two simulated outcomes, minus the
            # no-review baseline.
            pr = (pr1 + pr2) / 2 - base_pr
            sum_pr = np.sum(pr)
            if sum_pr > max_pr:
                max_pr = sum_pr
                max_id = doc_id
            # print("pr1", pr1)
            # print("pr2", pr2)
            # print("pr0", base_pr)
        print(f"choose doc{max_id} with marginal gain {max_pr}")
        return [max_id]

    def calc_prs(self, train_time, last_review, history, W):
        # Exponential forgetting curve: pr_i = exp(-(t - t_last_i) / exp(W_i . h_i)).
        last_review = train_time - last_review
        mem_param = np.exp(np.einsum('ij,ij->i', history, W))
        pr = np.exp(-last_review / mem_param)
        return pr
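

# Minimal smoke-test sketch: drives GreedyAgent with a hand-built observation
# dict matching the fields the agent reads above. The MultiDiscrete action
# space, document count, feature values, and timestamps are illustrative
# assumptions, not taken from the original environment.
if __name__ == "__main__":
    import gym

    num_docs = 4
    # Slate of size 1 with `num_docs` candidate documents.
    action_space = gym.spaces.MultiDiscrete([num_docs])
    agent = GreedyAgent(sess=None, observation_space=None,
                        action_space=action_space, eval_mode=True,
                        summary_writer=None)

    observation = {
        'user': {
            'time': 10.0,                        # current time step
            'last_review': np.zeros(num_docs),   # last review time per doc
            'history': np.zeros((num_docs, 3)),  # per-doc review statistics
        },
        # Each document exposes a 3-dim feature vector, used as W because the
        # user's true W is not observable in this observation.
        'doc': {str(i): np.full(3, 0.3 + 0.1 * i) for i in range(num_docs)},
    }

    slate = agent.begin_episode(observation)
    print("recommended slate:", slate)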