diff --git a/agent/__init__.py b/agent/__init__.py
index 8016aed..fc8deeb 100644
--- a/agent/__init__.py
+++ b/agent/__init__.py
@@ -1 +1,2 @@
-from .util import create_agent_helper
\ No newline at end of file
+from .util import create_agent_helper
+from .ucb_agent import UCBAgent
\ No newline at end of file
diff --git a/agent/ucb_agent.py b/agent/ucb_agent.py
new file mode 100644
index 0000000..51f95ef
--- /dev/null
+++ b/agent/ucb_agent.py
@@ -0,0 +1,69 @@
+from recsim.agent import AbstractEpisodicRecommenderAgent
+import tensorflow as tf
+import numpy as np
+
+class UCBAgent(AbstractEpisodicRecommenderAgent):
+    """UCB agent that recommends the item whose review maximizes the
+    expected gain in recall probability plus an exploration bonus."""
+
+    def __init__(self, sess, observation_space, action_space, eval_mode,
+                 alpha=1.0, learning_rate=0.001, summary_writer=None):
+        super(UCBAgent, self).__init__(action_space, summary_writer)
+        self._num_candidates = int(action_space.nvec[0])
+        # One weight row per candidate over the three history features (n, n+, n-).
+        self._W = tf.Variable(np.random.normal(size=(self._num_candidates, 3)), name='W')
+        self._sess = sess
+        self._return_idx = None
+        self._prev_pred_pr = None
+        self._opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
+        self._alpha = alpha
+
+        # This agent only supports slates of size one.
+        assert self._slate_size == 1
+
+    def step(self, reward, observation):
+        docs = observation['doc']
+        user = observation['user']
+        response = observation['response']
+
+        if self._return_idx is not None and response is not None:
+            # Update W with one gradient step on the cross-entropy between
+            # the previous recall prediction and the observed recall.
+            y_true = [response[0]['recall']]
+            y_pred = self._prev_pred_pr
+            loss = tf.losses.binary_crossentropy(y_true, y_pred)
+            self._sess.run(self._opt.minimize(loss))
+        base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)
+
+        # Hypothetical recall probabilities one step ahead, assuming each item
+        # is reviewed now with either a positive or a negative outcome.
+        time = user['time'] + 1
+        history_pos = user['history'].copy()
+        history_pos[:, [0, 1]] += 1  # increment n and n+ by 1
+        history_neg = user['history'].copy()
+        history_neg[:, [0, 2]] += 1  # increment n and n- by 1
+        last_review_now = np.repeat(user['time'], len(user['last_review']))
+        pr_pos = self.calc_prs(time, last_review_now, history_pos, self._W)
+        pr_neg = self.calc_prs(time, last_review_now, history_neg, self._W)
+
+        # Expected recall gain from a review, plus a UCB exploration bonus that
+        # grows with time since the last review and shrinks with review count.
+        gain = (pr_pos + pr_neg) / 2 - base_pr
+        time_since_last_review = user['time'] - user['last_review']
+        uncertainty = self._alpha * tf.math.sqrt(tf.math.log(time_since_last_review) / user['history'][:, 0])
+        ucb_score = gain + uncertainty
+        print("       gain:", gain.eval(session=self._sess))
+        print("uncertainty:", uncertainty.eval(session=self._sess))
+        best_idx = tf.argmax(ucb_score)
+
+        self._return_idx = self._sess.run(best_idx)
+        self._prev_pred_pr = base_pr[self._return_idx]
+        return [self._return_idx]
+
+    def calc_prs(self, train_time, last_review, history, W):
+        # Exponential forgetting-curve model: an item's memory strength is
+        # exp(history . W); recall probability decays with elapsed time.
+        elapsed = train_time - last_review
+        mem_param = tf.math.exp(tf.reduce_sum(history * W, axis=1))
+        pr = tf.math.exp(-elapsed / mem_param)
+        return pr
\ No newline at end of file
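
For reference, a minimal sketch of how the new agent could be wired into a RecSim experiment. The create_agent factory below is an assumption for illustration, modeled on the constructor signature in this diff; it is not part of the change itself:

# Hypothetical wiring sketch (not in this diff). Assumes a TF1-style session
# and a RecSim environment whose observations carry the 'user', 'doc', and
# 'response' fields that UCBAgent.step() reads.
from agent import UCBAgent

def create_agent(sess, environment, eval_mode, summary_writer=None):
    # action_space.nvec[0] gives the number of candidate items; note the
    # agent asserts a slate size of one in its constructor.
    return UCBAgent(sess,
                    environment.observation_space,
                    environment.action_space,
                    eval_mode,
                    alpha=1.0,            # exploration strength
                    learning_rate=0.001,  # step size for the W update
                    summary_writer=summary_writer)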