Repository metrics
- Stars
- (17,706 stars)
- PR merge metrics
- (平均マージ 6d 16h) (30d で 10 merged PRs)
説明
Description
Hello all, I'm trying to use the NCF_deep_dive notebook with my own data, which has the following structure:
| | usr_id | code_id | amt_trx | bestelldatum |
|---|---|---|---|---|
| 0 | 0 | 35 | 1 | 2022-03-01 |
| 1 | 0 | 2 | 1 | 2022-03-01 |
| 2 | 0 | 18 | 1 | 2022-03-01 |
| 3 | 0 | 9 | 1 | 2022-03-01 |
| 4 | 0 | 0 | 1 | 2022-03-01 |
When I try to create the dataset, I get the following error:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True, col_user='usr_id', col_item='code_id', col_rating='amt_trx', binary=False)
---------------------------------------------------------------------------
MissingUserException Traceback (most recent call last)
Cell In [39], line 1
----> 1 data = NCFDataset(train_file=train_file,
2 test_file=leave_one_out_test_file,
3 seed=SEED,
4 overwrite_test_file_full=True,
5 col_user='usr_id',
6 col_item='code_id',
7 col_rating='amt_trx',
8 binary=False)
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/dataset.py:376, in Dataset.__init__(self, train_file, test_file, test_file_full, overwrite_test_file_full, n_neg, n_neg_test, col_user, col_item, col_rating, binary, seed, sample_with_replacement, print_warnings)
374 self.test_file_full = os.path.splitext(self.test_file)[0] + "_full.csv"
375 if self.overwrite_test_file_full or not os.path.isfile(self.test_file_full):
--> 376 self._create_test_file()
377 self.test_full_datafile = DataFile(
378 filename=self.test_file_full,
379 col_user=self.col_user,
(...)
383 binary=self.binary,
384 )
385 # set random seed
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/dataset.py:417, in Dataset._create_test_file(self)
415 if user in train_datafile.users:
416 user_test_data = test_datafile.load_data(user)
--> 417 user_train_data = train_datafile.load_data(user)
418 # for leave-one-out evaluation, exclude items seen in both training and test sets
419 # when sampling negatives
420 user_positive_item_pool = set(
421 user_test_data[self.col_item].unique()
422 ).union(user_train_data[self.col_item].unique())
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/dataset.py:194, in DataFile.load_data(self, key, by_user)
192 while (self.line_num == 0) or (self.row[key_col] != key):
193 if self.end_of_file:
--> 194 raise MissingUserException("User {} not in file {}".format(key, self.filename))
195 next(self)
196 # collect user/test batch data
MissingUserException: User 58422 not in file ./train_new.csv
I made some checks: `print(train.usr_id.nunique())` --> output: 81062; `print(test.usr_id.nunique())` --> output: 81062; `print(leave.usr_id.nunique())` --> output: 81062.
I also checked by hand, and user 58422 is in all the files. The types are also the same: I'm using int64 for usr_id, code_id and amt_trx, like the MovieLens dataset.
I can't understand the error. Could you help me, please?
Update
If I remove the parameter overwrite_test_file_full, it creates the dataset, but then I can't make predictions because the dataset object didn't create the user2id mapping:
data = NCFDataset(train_file=train_file,
test_file=leave_one_out_test_file,
seed=SEED,
col_user='usr_id',
col_item='code_id',
col_rating='amt_trx',
print_warnings=True)
model = NCF (
n_users=data.n_users,
n_items=data.n_items,
model_type="NeuMF",
n_factors=4,
layer_sizes=[16,8,4],
n_epochs=EPOCHS,
batch_size=BATCH_SIZE,
learning_rate=1e-3,
verbose=99,
seed=SEED
)
predictions = [[row.usr_id, row.code_id, model.predict(row.usr_id, row.code_id)]
for (_, row) in test.iterrows()]
predictions = pd.DataFrame(predictions, columns=['usr_id', 'code_id', 'prediction'])
predictions.head()
AttributeError Traceback (most recent call last)
Cell In [38], line 1
----> 1 predictions = [[row.usr_id, row.code_id, model.predict(row.usr_id, row.code_id)]
2 for (_, row) in test.iterrows()]
5 predictions = pd.DataFrame(predictions, columns=['usr_id', 'code_id', 'prediction'])
6 predictions.head()
Cell In [38], line 1, in <listcomp>(.0)
----> 1 predictions = [[row.usr_id, row.code_id, model.predict(row.usr_id, row.code_id)]
2 for (_, row) in test.iterrows()]
5 predictions = pd.DataFrame(predictions, columns=['usr_id', 'code_id', 'prediction'])
6 predictions.head()
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/ncf_singlenode.py:434, in NCF.predict(self, user_input, item_input, is_list)
431 return list(output.reshape(-1))
433 else:
--> 434 output = self._predict(np.array([user_input]), np.array([item_input]))
435 return float(output.reshape(-1)[0])
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/ncf_singlenode.py:440, in NCF._predict(self, user_input, item_input)
437 def _predict(self, user_input, item_input):
438
439 # index converting
--> 440 user_input = np.array([self.user2id[x] for x in user_input])
441 item_input = np.array([self.item2id[x] for x in item_input])
443 # get feed dict
File /anaconda/envs/recsys/lib/python3.8/site-packages/recommenders/models/ncf/ncf_singlenode.py:440, in <listcomp>(.0)
437 def _predict(self, user_input, item_input):
438
439 # index converting
--> 440 user_input = np.array([self.user2id[x] for x in user_input])
441 item_input = np.array([self.item2id[x] for x in item_input])
443 # get feed dict
AttributeError: 'NCF' object has no attribute 'user2id'