ucb agent test

2023-10-29 01:43:35 +08:00
parent 84a08b08ee
commit fe56de9d76
2 changed files with 61 additions and 1 deletions
@@ -1 +1,2 @@
 from .util import create_agent_helper
+from .ucb_agent import UCBAgent
@@ -0,0 +1,59 @@
+from recsim.agent import AbstractEpisodicRecommenderAgent
+import tensorflow as tf
+import numpy as np
+
+class UCBAgent(AbstractEpisodicRecommenderAgent):
+  def __init__(self, sess, observation_space, action_space, eval_mode, alpha=1.0, learning_rate=0.001, summary_writer=None):
+    super(UCBAgent, self).__init__(action_space, summary_writer)
+    self._num_candidates = int(action_space.nvec[0])
+    self._W = tf.Variable(np.random.normal(size=(self._num_candidates, 3)), name='W')
+    self._sess = sess
+    self._return_idx = None
+    self._prev_pred_pr = None
+    self._opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
+    self._alpha = alpha
+  
+    assert self._slate_size == 1
+  def step(self, reward, observation):
+    docs = observation['doc']
+    user = observation['user']
+    response = observation['response']
+
+    if self._return_idx != None and response != None:
+      # update w
+      y_true = [response[0]['recall']]
+      y_pred = self._prev_pred_pr
+      loss = tf.losses.binary_crossentropy(y_true, y_pred)
+      self._sess.run(self._opt.minimize(loss))
+    base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)
+
+    time = user['time'] + 1
+    history_pos = user['history'].copy()
+    history_pos[:, [0, 1]] += 1 # add n, n+ by 1
+    history_neg = user['history'].copy()
+    history_neg[:, [0, 2]] += 1 # add n, n- by 1
+    last_review_now = np.repeat(user['time'], len(user['last_review']))
+    pr_pos = self.calc_prs(time, last_review_now, history_pos, self._W)
+    pr_neg = self.calc_prs(time, last_review_now, history_neg, self._W)
+
+    gain = (pr_pos + pr_neg) / 2 - base_pr
+    time_since_last_review = user['time'] - user['last_review']
+    uncertainty = self._alpha * tf.math.sqrt(tf.math.log(time_since_last_review) / user['history'][:, 0])
+    # print(gain.eval(session=self._sess))
+    # print(time_since_last_review)
+    # print(uncertainty.eval(session=self._sess))
+    ucb_score = gain + uncertainty
+    print("       gain:", gain.eval(session=self._sess))
+    print("uncertainty:", uncertainty.eval(session=self._sess))
+    best_idx = tf.argmax(ucb_score)
+
+    self._return_idx = self._sess.run(best_idx)
+    self._prev_pred_pr = base_pr[self._return_idx]
+    return [self._return_idx]
+
+    
+  def calc_prs(self, train_time, last_review, history, W):
+    last_review = train_time - last_review
+    mem_param = tf.math.exp(tf.reduce_sum(history * W, axis=1))
+    pr = tf.math.exp(-last_review / mem_param)
+    return pr