test greedy agent

2023-10-29 17:29:17 +08:00
3 changed files with 59 additions and 60 deletions
--- a/agent/init.py
+++ b/agent/init.py
@@ -1,2 +1,2 @@
 from .util import create_agent_helper
-from .ucb_agent import UCBAgent
+from .greedy import GreedyAgent
--- a/agent/greedy.py
+++ b/agent/greedy.py
@@ -0,0 +1,58 @@
+from recsim.agent import AbstractEpisodicRecommenderAgent
+import numpy as np
+
+class GreedyAgent(AbstractEpisodicRecommenderAgent):  # each step, recommends the one doc whose simulated review gives the largest summed change in calc_prs output
+  def __init__(self, sess, observation_space, action_space, eval_mode, summary_writer):  # sess, observation_space and eval_mode are accepted but never read in this class
+    super(GreedyAgent, self).__init__(action_space, summary_writer)
+    self._num_candidates = int(action_space.nvec[0])  # candidate count comes from the first entry of action_space.nvec
+    self._W = np.array([[3, 1.5, 0.5]] * self._num_candidates)  # default weights: one [3, 1.5, 0.5] row per candidate, overwritten in begin_episode
+    assert self._slate_size == 1  # only single-item slates are handled; _slate_size is presumably set by the base __init__ — TODO confirm
+  def begin_episode(self, observation=None):  # refreshes W from the observation, then returns the first action; NOTE(review): the None default is unusable because observation is indexed right away
+    user = observation['user']
+    docs = observation['doc']
+    if 'W' in user: # use observable W
+      self._W = user['W']
+    else:  # no W exposed by the user observation: rebuild it from the doc observations
+      w = []
+      for doc_id in docs:
+        w.append(docs[doc_id])  # presumably each docs[doc_id] holds 3 values — TODO confirm
+      self._W = np.array(w).reshape((-1, 3))  # one row of 3 values per doc, in docs iteration order
+      print("agent W:", self._W)  # debug output of the rebuilt weights
+    self._episode_num += 1  # _episode_num is presumably initialised by the base class — TODO confirm
+    return self.step(0, observation)  # first action of the episode, requested with reward 0
+  def step(self, reward, observation):  # returns a one-element list with the chosen doc index; reward is not read
+    docs = observation['doc']
+    user = observation['user']
+
+    base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)  # baseline: pr of every doc at the current time, before any simulated review
+    # pr model used throughout (see calc_prs): exp(-(time - last_review) / exp(row-wise dot(history, W)))
+    max_pr = -self._num_candidates  # sentinel floor for the running maximum; assumes every per-doc difference stays above -1 — TODO confirm
+    max_id = 0  # doc 0 is returned if no candidate beats the sentinel
+    for did in docs:  # simulate reviewing each candidate in turn
+      doc_id = int(did)
+      last_review = user['last_review'].copy()  # work on copies so the observation is not modified
+      history =  user['history'].copy()  # assumes an ndarray; copy() would be shallow for nested lists — TODO confirm
+      last_review[doc_id] = user['time']  # the candidate is treated as reviewed at the current time
+      time = user['time'] + 1  # both outcomes are evaluated one time unit later
+
+      history[doc_id][0] += 1  # outcome A: columns 0 and 1 incremented (presumably review count and positive count — TODO confirm)
+      history[doc_id][1] += 1
+      pr1 = self.calc_prs(time, last_review, history, self._W)
+      history[doc_id][1] -= 1  # outcome B: undo the column-1 increment and increment column 2 instead
+      history[doc_id][2] += 1
+      pr2 = self.calc_prs(time, last_review, history, self._W)
+      pr = (pr1 + pr2) / 2 - base_pr  # per-doc mean of the two outcomes minus the baseline
+      sum_pr = np.sum(pr)  # candidate score: total change summed over all docs
+      if sum_pr > max_pr:  # strict '>' keeps the earliest candidate on ties
+        max_pr = sum_pr
+        max_id = doc_id
+      # print("pr1", pr1)
+      # print("pr2", pr2)
+      # print("pr0", base_pr)
+    print(f"choose doc{max_id} with marginal gain {max_pr}")
+    return [max_id]  # slate of size 1
+  def calc_prs(self, train_time, last_review, history, W):  # returns exp(-(train_time - last_review) / exp(sum_j history[i, j] * W[i, j])) for every row i
+    last_review = train_time - last_review  # rebinds the name to the elapsed time since each last review
+    mem_param = np.exp(np.einsum('ij,ij->i', history, W))  # row-wise dot product of history and W, exponentiated
+    pr = np.exp(-last_review / mem_param)
+    return pr
--- a/agent/ucb_agent.py
+++ b/agent/ucb_agent.py
@@ -1,59 +0,0 @@
-from recsim.agent import AbstractEpisodicRecommenderAgent
-import tensorflow as tf
-import numpy as np
-
-class UCBAgent(AbstractEpisodicRecommenderAgent):  # recommends the doc with the highest (expected pr gain + alpha-scaled uncertainty) score and fits W from responses
-  def __init__(self, sess, observation_space, action_space, eval_mode, alpha=1.0, learning_rate=0.001, summary_writer=None):  # alpha scales the uncertainty term; learning_rate feeds the optimizer; observation_space and eval_mode are not read
-    super(UCBAgent, self).__init__(action_space, summary_writer)
-    self._num_candidates = int(action_space.nvec[0])  # candidate count comes from the first entry of action_space.nvec
-    self._W = tf.Variable(np.random.uniform(0, 10, size=(self._num_candidates, 3)), name='W')  # trainable (num_candidates, 3) weights drawn uniformly from [0, 10)
-    self._sess = sess  # session used to evaluate tensors and run the optimizer
-    self._return_idx = None  # index returned by the previous step; None before the first step
-    self._prev_pred_pr = None  # predicted pr of the previously returned doc
-    self._opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
-    self._alpha = alpha
-  
-    assert self._slate_size == 1  # only single-item slates are handled; _slate_size is presumably set by the base __init__ — TODO confirm
-  def step(self, reward, observation):  # returns a one-element list with the chosen doc index; reward is not read
-    docs = observation['doc']
-    user = observation['user']
-    response = observation['response']
-
-    if self._return_idx != None and response != None:  # skip the update on the first step or when there is no response; NOTE(review): 'is not None' is the idiomatic test
-      # update w
-      y_true = [response[0]['recall']]  # label: the 'recall' field of the first response
-      y_pred = self._prev_pred_pr
-      loss = tf.losses.binary_crossentropy(y_true, y_pred)
-      self._sess.run(self._opt.minimize(loss))  # NOTE(review): creates a new minimize op on every call — confirm this is intended
-    base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)  # baseline: pr of every doc at the current time
-
-    time = user['time'] + 1  # both outcomes are evaluated one time unit later
-    history_pos = user['history'].copy()
-    history_pos[:, [0, 1]] += 1 # add n, n+ by 1
-    history_neg = user['history'].copy()
-    history_neg[:, [0, 2]] += 1 # add n, n- by 1
-    last_review_now = np.repeat(user['time'], len(user['last_review']))  # every doc treated as reviewed at the current time
-    pr_pos = self.calc_prs(time, last_review_now, history_pos, self._W)
-    pr_neg = self.calc_prs(time, last_review_now, history_neg, self._W)
-
-    gain = (pr_pos + pr_neg) / 2 - base_pr  # per-doc mean of the two outcomes minus the baseline
-    time_since_last_review = user['time'] - user['last_review']
-    uncertainty = self._alpha * tf.math.sqrt(tf.math.log(time_since_last_review) / user['history'][:, 0])  # NOTE(review): log(0) or division by zero if an elapsed time or a column-0 count is 0 — confirm inputs
-    # print(gain.eval(session=self._sess))
-    # print(time_since_last_review)
-    # print(uncertainty.eval(session=self._sess))
-    ucb_score = gain + uncertainty
-    print("       gain:", gain.eval(session=self._sess))
-    print("uncertainty:", uncertainty.eval(session=self._sess))
-    best_idx = tf.argmax(ucb_score)
-
-    self._return_idx = self._sess.run(best_idx)  # evaluate the argmax in the session to obtain a concrete index
-    self._prev_pred_pr = base_pr[self._return_idx]  # kept as the prediction for the next step's loss
-    return [self._return_idx]
-
-    
-  def calc_prs(self, train_time, last_review, history, W):  # returns exp(-(train_time - last_review) / exp(sum over axis 1 of history * W)) as a tensor
-    last_review = train_time - last_review  # rebinds the name to the elapsed time since each last review
-    mem_param = tf.math.exp(tf.reduce_sum(history * W, axis=1))  # row-wise weighted sum of history, exponentiated
-    pr = tf.math.exp(-last_review / mem_param)
-    return pr