Compare commits
1 Commit
| Author | SHA1 | Date |
| --- | --- | --- |
| | 84cb7f4ecf | |
@@ -1,2 +1,2 @@
 from .util import create_agent_helper
-from .ucb_agent import UCBAgent
+from .greedy import GreedyAgent
agent/greedy.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from recsim.agent import AbstractEpisodicRecommenderAgent
import numpy as np


class GreedyAgent(AbstractEpisodicRecommenderAgent):
    def __init__(self, sess, observation_space, action_space, eval_mode, summary_writer):
        super(GreedyAgent, self).__init__(action_space, summary_writer)
        self._num_candidates = int(action_space.nvec[0])
        self._W = np.array([[3, 1.5, 0.5]] * self._num_candidates)
        assert self._slate_size == 1

    def begin_episode(self, observation=None):
        user = observation['user']
        docs = observation['doc']
        if 'W' in user:  # use observable W
            self._W = user['W']
        else:
            w = []
            for doc_id in docs:
                w.append(docs[doc_id])
            self._W = np.array(w).reshape((-1, 3))
        print("agent W:", self._W)
        self._episode_num += 1
        return self.step(0, observation)

    def step(self, reward, observation):
        docs = observation['doc']
        user = observation['user']

        base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)
        # np.exp(-last_review / np.exp(np.dot(W, x))).squeeze()
        max_pr = -self._num_candidates
        max_id = 0
        for did in docs:
            doc_id = int(did)
            last_review = user['last_review'].copy()
            history = user['history'].copy()
            last_review[doc_id] = user['time']
            time = user['time'] + 1

            history[doc_id][0] += 1
            history[doc_id][1] += 1
            pr1 = self.calc_prs(time, last_review, history, self._W)
            history[doc_id][1] -= 1
            history[doc_id][2] += 1
            pr2 = self.calc_prs(time, last_review, history, self._W)
            pr = (pr1 + pr2) / 2 - base_pr
            sum_pr = np.sum(pr)
            if sum_pr > max_pr:
                max_pr = sum_pr
                max_id = doc_id
            # print("pr1", pr1)
            # print("pr2", pr2)
            # print("pr0", base_pr)
        print(f"choose doc{max_id} with marginal gain {max_pr}")
        return [max_id]

    def calc_prs(self, train_time, last_review, history, W):
        last_review = train_time - last_review
        mem_param = np.exp(np.einsum('ij,ij->i', history, W))
        pr = np.exp(-last_review / mem_param)
        return pr
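GreedyAgent models each item's recall probability as exp(-(t - t_last) / exp(w · h)), where h = [n, n+, n-] counts total, recalled, and forgotten reviews, and its step picks the item whose simulated review (averaging the recalled and forgotten outcomes) yields the largest summed gain over the do-nothing baseline. Below is a minimal standalone sketch of that computation; the helper name `expected_gains` and all toy values are illustrative assumptions, not part of the repository.

```python
import numpy as np

def calc_prs(t, last_review, history, W):
    # Recall probability per item: exp(-(t - last_review_i) / exp(<history_i, W_i>)).
    mem_param = np.exp(np.einsum('ij,ij->i', history, W))
    return np.exp(-(t - last_review) / mem_param)

def expected_gains(t, last_review, history, W):
    # For each candidate item, average the recall probabilities after a successful
    # and an unsuccessful review of that item, and compare against the baseline.
    base_pr = calc_prs(t, last_review, history, W)
    gains = []
    for i in range(len(last_review)):
        lr = last_review.copy()
        lr[i] = t  # reviewing item i now resets its forgetting clock
        h_pos, h_neg = history.copy(), history.copy()
        h_pos[i, [0, 1]] += 1  # review counted, item recalled
        h_neg[i, [0, 2]] += 1  # review counted, item forgotten
        pr = (calc_prs(t + 1, lr, h_pos, W) + calc_prs(t + 1, lr, h_neg, W)) / 2
        gains.append(np.sum(pr - base_pr))
    return np.array(gains)

# Toy user state for two items, history columns = [n, n+, n-] (illustrative values).
t = 5.0
last_review = np.array([1.0, 3.0])
history = np.array([[1.0, 1.0, 0.0],
                    [1.0, 0.0, 1.0]])
W = np.array([[3.0, 1.5, 0.5]] * 2)

gains = expected_gains(t, last_review, history, W)
print("expected gains:", gains, "-> recommend item", int(np.argmax(gains)))
```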
Deleted file (59 lines):
@@ -1,59 +0,0 @@
from recsim.agent import AbstractEpisodicRecommenderAgent
import tensorflow as tf
import numpy as np


class UCBAgent(AbstractEpisodicRecommenderAgent):
    def __init__(self, sess, observation_space, action_space, eval_mode, alpha=1.0, learning_rate=0.001, summary_writer=None):
        super(UCBAgent, self).__init__(action_space, summary_writer)
        self._num_candidates = int(action_space.nvec[0])
        self._W = tf.Variable(np.random.uniform(0, 10, size=(self._num_candidates, 3)), name='W')
        self._sess = sess
        self._return_idx = None
        self._prev_pred_pr = None
        self._opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
        self._alpha = alpha

        assert self._slate_size == 1

    def step(self, reward, observation):
        docs = observation['doc']
        user = observation['user']
        response = observation['response']

        if self._return_idx != None and response != None:
            # update w
            y_true = [response[0]['recall']]
            y_pred = self._prev_pred_pr
            loss = tf.losses.binary_crossentropy(y_true, y_pred)
            self._sess.run(self._opt.minimize(loss))
        base_pr = self.calc_prs(user['time'], user['last_review'], user['history'], self._W)

        time = user['time'] + 1
        history_pos = user['history'].copy()
        history_pos[:, [0, 1]] += 1  # add n, n+ by 1
        history_neg = user['history'].copy()
        history_neg[:, [0, 2]] += 1  # add n, n- by 1
        last_review_now = np.repeat(user['time'], len(user['last_review']))
        pr_pos = self.calc_prs(time, last_review_now, history_pos, self._W)
        pr_neg = self.calc_prs(time, last_review_now, history_neg, self._W)

        gain = (pr_pos + pr_neg) / 2 - base_pr
        time_since_last_review = user['time'] - user['last_review']
        uncertainty = self._alpha * tf.math.sqrt(tf.math.log(time_since_last_review) / user['history'][:, 0])
        # print(gain.eval(session=self._sess))
        # print(time_since_last_review)
        # print(uncertainty.eval(session=self._sess))
        ucb_score = gain + uncertainty
        print(" gain:", gain.eval(session=self._sess))
        print("uncertainty:", uncertainty.eval(session=self._sess))
        best_idx = tf.argmax(ucb_score)

        self._return_idx = self._sess.run(best_idx)
        self._prev_pred_pr = base_pr[self._return_idx]
        return [self._return_idx]

    def calc_prs(self, train_time, last_review, history, W):
        last_review = train_time - last_review
        mem_param = tf.math.exp(tf.reduce_sum(history * W, axis=1))
        pr = tf.math.exp(-last_review / mem_param)
        return pr
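The removed UCBAgent ranked items by the same expected recall gain plus an exploration bonus, alpha * sqrt(log(t - t_last) / n), where n is the item's review count. A minimal plain-numpy sketch of that scoring rule follows; there is no TensorFlow session here, and the function name `ucb_scores` and all numbers are illustrative assumptions.

```python
import numpy as np

def ucb_scores(gain, t, last_review, n_reviews, alpha=1.0):
    # The exploration bonus grows with time since an item's last review
    # and shrinks with the number of times the item has been reviewed.
    uncertainty = alpha * np.sqrt(np.log(t - last_review) / n_reviews)
    return gain + uncertainty

# Illustrative values for three items.
gain = np.array([0.10, 0.05, 0.08])       # expected recall gain per item
t = 10.0
last_review = np.array([2.0, 7.0, 5.0])   # last review times
n_reviews = np.array([4.0, 1.0, 2.0])     # total reviews so far (history[:, 0])

scores = ucb_scores(gain, t, last_review, n_reviews, alpha=1.0)
print("UCB scores:", scores, "-> recommend item", int(np.argmax(scores)))
```

In this toy run item 1 wins despite having the smallest expected gain because it has been reviewed only once, which is exactly the exploration behaviour the bonus term adds on top of the greedy gain.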