@@ -23,7 +23,7 @@ def forward(self, inputs):
 
 class TimeAwareMultiHeadAttention(torch.nn.Module):
     # required homebrewed mha layer for Ti/SASRec experiments
-    def __init__(self, hidden_size, head_num, dropout_rate):
+    def __init__(self, hidden_size, head_num, dropout_rate, dev):
         super(TimeAwareMultiHeadAttention, self).__init__()
         self.Q_w = torch.nn.Linear(hidden_size, hidden_size)
         self.K_w = torch.nn.Linear(hidden_size, hidden_size)
@@ -36,6 +36,7 @@ def __init__(self, hidden_size, head_num, dropout_rate):
         self.head_num = head_num
         self.head_size = hidden_size // head_num
         self.dropout_rate = dropout_rate
+        self.dev = dev
 
     def forward(self, queries, keys, time_mask, attn_mask, time_matrix_K, time_matrix_V, abs_pos_K, abs_pos_V):
         Q, K, V = self.Q_w(queries), self.K_w(keys), self.V_w(keys)
@@ -63,7 +64,8 @@ def forward(self, queries, keys, time_mask, attn_mask, time_matrix_K, time_matri
 
         time_mask = time_mask.unsqueeze(-1).expand(attn_weights.shape[0], -1, attn_weights.shape[-1])
         attn_mask = attn_mask.unsqueeze(0).expand(attn_weights.shape[0], -1, -1)
-        paddings = torch.ones(attn_weights.shape) * FLOAT_MIN # float('-inf')
+        paddings = torch.ones(attn_weights.shape) * -1e23 # float('-inf')
+        paddings = paddings.to(self.dev)
         attn_weights = torch.where(time_mask, paddings, attn_weights) # True: pick padding
         attn_weights = torch.where(attn_mask, paddings, attn_weights) # enforcing causality
 
@@ -119,7 +121,8 @@ def __init__(self, user_num, item_num, time_num, args):
 
             new_attn_layer = TimeAwareMultiHeadAttention(args.hidden_units,
                                                          args.num_heads,
-                                                         args.dropout_rate)
+                                                         args.dropout_rate,
+                                                         args.device)
             self.attention_layers.append(new_attn_layer)
 
             new_fwd_layernorm = torch.nn.LayerNorm(args.hidden_units, eps=1e-8)
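
Note on the device-placement fix above: a minimal standalone sketch, not part of this commit, with hypothetical tensor shapes chosen only for illustration. It shows that torch.full_like allocates the padding tensor with the same shape, dtype, and device as attn_weights, so the padding values land on the right device without threading a dev argument through the module.

import torch

# Sketch only: shapes below are illustrative, not the repo's actual tensors.
attn_weights = torch.randn(4, 5, 5)                                      # (batch*heads, seq, seq)
attn_mask = torch.triu(torch.ones(5, 5, dtype=torch.bool), diagonal=1)   # causal mask
attn_mask = attn_mask.unsqueeze(0).expand(attn_weights.shape[0], -1, -1)
paddings = torch.full_like(attn_weights, -1e23)                          # same device/dtype as attn_weights
attn_weights = torch.where(attn_mask, paddings, attn_weights)            # masked positions get -1e23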