|
| 1 | +Building datasets |
| 2 | +================= |
| 3 | + |
| 4 | +In this example, we'll use LightFM's built-in ``Dataset`` class to build |
| 5 | +an interaction dataset from raw data. The goal is to demonstrate how to |
| 6 | +go from raw data (lists of interactions and perhaps item and user |
| 7 | +features) to ``scipy.sparse`` matrices that can be used to fit a LightFM |
| 8 | +model. |
| 9 | + |
| 10 | +Getting the data |
| 11 | +---------------- |
| 12 | + |
| 13 | +We're going to use the
| 14 | +`Book-Crossing dataset <http://www2.informatik.uni-freiburg.de/~cziegler/BX/>`__ as our
| 15 | +example dataset. Let's download the data first.
| 16 | + |
| 17 | +.. code:: python |
| 18 | +
|
| 19 | + import os |
| 20 | + import zipfile |
| 21 | + import csv |
| 22 | +
|
| 23 | + import requests |
| 24 | +
|
| 25 | +
|
| 26 | + def _download(url: str, dest_path: str): |
| 27 | +
|
| 28 | + req = requests.get(url, stream=True) |
| 29 | + req.raise_for_status() |
| 30 | +
|
| 31 | + with open(dest_path, "wb") as fd: |
| 32 | + for chunk in req.iter_content(chunk_size=2 ** 20): |
| 33 | + fd.write(chunk) |
| 34 | +
|
| 35 | +
|
| 36 | + def get_data(): |
| 37 | +
|
| 38 | + ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip") |
| 39 | +
|
| 40 | + if not os.path.exists("data"): |
| 41 | + os.makedirs("data") |
| 42 | +
|
| 43 | + _download(ratings_url, "data/data.zip") |
| 44 | +
|
| 45 | + with zipfile.ZipFile("data/data.zip") as archive: |
| 46 | + return ( |
| 47 | + csv.DictReader( |
| 48 | + (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")), |
| 49 | + delimiter=";", |
| 50 | + ), |
| 51 | + csv.DictReader( |
| 52 | + (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";" |
| 53 | + ), |
| 54 | + ) |
| 55 | +
|
| 56 | +
|
| 57 | + def get_ratings(): |
| 58 | +
|
| 59 | + return get_data()[0] |
| 60 | +
|
| 61 | +
|
| 62 | + def get_book_features(): |
| 63 | +
|
| 64 | + return get_data()[1] |
| 65 | +
|
| 66 | +The data consists of book ratings and book details: |
| 67 | + |
| 68 | +.. code:: python |
| 69 | +
|
| 70 | + import json |
| 71 | + from itertools import islice |
| 72 | +
|
| 73 | + ratings, book_features = get_data() |
| 74 | +
|
| 75 | +Ratings look like this: |
| 76 | + |
| 77 | +.. code:: python |
| 78 | +
|
| 79 | + for line in islice(ratings, 2): |
| 80 | + print(json.dumps(line, indent=4)) |
| 81 | +
|
| 82 | +:: |
| 83 | + |
| 84 | + { |
| 85 | + "User-ID": "276725", |
| 86 | + "ISBN": "034545104X", |
| 87 | + "Book-Rating": "0" |
| 88 | + } |
| 89 | + { |
| 90 | + "User-ID": "276726", |
| 91 | + "ISBN": "0155061224", |
| 92 | + "Book-Rating": "5" |
| 93 | + } |
| 94 | + |
| 95 | +and book features look like this: |
| 96 | + |
| 97 | +.. code:: python |
| 98 | +
|
| 99 | + for line in islice(book_features, 1): |
| 100 | + print(json.dumps(line, indent=4)) |
| 101 | +
|
| 102 | +:: |
| 103 | + |
| 104 | + { |
| 105 | + "ISBN": "0195153448", |
| 106 | + "Book-Title": "Classical Mythology", |
| 107 | + "Book-Author": "Mark P. O. Morford", |
| 108 | + "Year-Of-Publication": "2002", |
| 109 | + "Publisher": "Oxford University Press", |
| 110 | + "Image-URL-S": |
| 111 | + "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg", |
| 112 | + "Image-URL-M": |
| 113 | + "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg", |
| 114 | + "Image-URL-L": |
| 115 | + "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg" |
| 116 | + } |
| 117 | + |
| 118 | +Building the ID mappings |
| 119 | +------------------------ |
| 120 | + |
| 121 | +The first thing we need to do is to create a mapping between the user |
| 122 | +and item ids from our input data to indices that will be used internally |
| 123 | +by our model. |
| 124 | + |
| 125 | +We do this because LightFM works with user and item ids that are |
| 126 | +consecutive non-negative integers. The ``Dataset`` class allows us to
| 127 | +create a mapping between the IDs we use in our systems and the |
| 128 | +consecutive indices preferred by the model. |
| 129 | + |
| 130 | +To do this, we create a dataset and call its ``fit`` method. The first |
| 131 | +argument is an iterable of all user ids in our data, and the second is |
| 132 | +an iterable of all item ids. In this case, we use generator expressions |
| 133 | +to lazily iterate over our data and yield user and item ids: |
| 134 | + |
| 135 | +.. code:: python |
| 136 | +
|
| 137 | + from lightfm.data import Dataset |
| 138 | +
|
| 139 | + dataset = Dataset() |
| 140 | + dataset.fit((x['User-ID'] for x in get_ratings()), |
| 141 | + (x['ISBN'] for x in get_ratings())) |
| 142 | +
|
| 143 | +This call will assign an internal numerical id to every user and item id |
| 144 | +we pass in. These will be contiguous (from 0 to however many users and |
| 145 | +items we have), and will also determine the dimensions of the resulting |
| 146 | +LightFM model. |
| 147 | + |
| 148 | +We can check that the mappings have been created by querying the dataset |
| 149 | +on how many users and books it knows about: |
| 150 | + |
| 151 | +.. code:: python |
| 152 | +
|
| 153 | + num_users, num_items = dataset.interactions_shape() |
| 154 | + print('Num users: {}, num_items {}.'.format(num_users, num_items)) |
| 155 | +
|
| 156 | +:: |
| 157 | + |
| 158 | + Num users: 105283, num_items 340553. |
| 159 | + |
| 160 | +Note that if we don't have all user and items ids at once, we can |
| 161 | +repeatedly call ``fit_partial`` to supply additional ids. In this case, |
| 162 | +we will use this capability to add some item feature mappings: |
| 163 | + |
| 164 | +.. code:: python |
| 165 | +
|
| 166 | + dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), |
| 167 | + item_features=(x['Book-Author'] for x in get_book_features())) |
| 168 | +
|
| 169 | +This will create a feature for every unique author name in the dataset. |
| 170 | + |
| 171 | +(Note that we fit some more item ids: this is to make sure our mappings |
| 172 | +are complete even if there are items in the features dataset that are |
| 173 | +not in the interactions set.) |
| 174 | + |
| 175 | +Building the interactions matrix |
| 176 | +-------------------------------- |
| 177 | + |
| 178 | +Having created the mapping, we build the interaction matrix: |
| 179 | + |
| 180 | +.. code:: python |
| 181 | +
|
| 182 | + (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN']) |
| 183 | + for x in get_ratings())) |
| 184 | +
|
| 185 | + print(repr(interactions)) |
| 186 | +
|
| 187 | +:: |
| 188 | + |
| 189 | + <105283x341762 sparse matrix of type '<class 'numpy.int32'>' |
| 190 | + with 1149780 stored elements in COOrdinate format> |
| 191 | + |
| 192 | +This is the main input into a LightFM model: it encodes the interactions
| 193 | +between users and items.
| 194 | + |
| 195 | +Since we have item features, we can also create the item features |
| 196 | +matrix: |
| 197 | + |
| 198 | +.. code:: python |
| 199 | +
|
| 200 | + item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']]) |
| 201 | + for x in get_book_features())) |
| 202 | + print(repr(item_features)) |
| 203 | +
|
| 204 | +:: |
| 205 | + |
| 206 | + <341762x443805 sparse matrix of type '<class 'numpy.float32'>' |
| 207 | + with 613141 stored elements in Compressed Sparse Row format> |
| 208 | + |
| 209 | +Building a model |
| 210 | +---------------- |
| 211 | + |
| 212 | +This is all we need to build a LightFM model: |
| 213 | + |
| 214 | +.. code:: python |
| 215 | +
|
| 216 | + from lightfm import LightFM |
| 217 | +
|
| 218 | + model = LightFM(loss='bpr') |
| 219 | + model.fit(interactions, item_features=item_features) |
| 220 | +
|
| 221 | +:: |
| 222 | + |
| 223 | + <lightfm.lightfm.LightFM at 0x7f5f0e8f7c88> |
0 commit comments