Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions app-requirements.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
- The /toggle endpoint updates each user's recommendations based on the most popular items in their city.
The /toggle endpoint is what runs the process that generates the user recommendations.

- There is a boolean switch that changes which process is run (popular/machine learning).
We need your recommendation algorithm implementation to run every time that endpoint is called.


### So we need the following:

1. Functions to query all the data you need for your algorithm. (For this to work in production we need to be careful with memory. Does your algorithm work in chunks? Can it work by processing 10,000 rows of data at a time, or does it need all the data at once? Another solution would be to use a cluster computing framework, which might be the better way to go about it.)

2. Function that takes in the queried data and begins the recommendation algorithm you've made

3. Function that populates the UserRecommendation table and associates each UserRecommendation ID with its corresponding Users. (When I was doing research, it was mentioned that some users are likely to get very similar recommendations, so in order to save data in production the same recommendations would be given to, for example, three users that are very similar. In the implementation, a UserRecommendation ID can therefore be associated with more than one user. This doesn't have to be the case if you don't want it; we can have a one-to-one relationship between UserRecommendation and User.) How the popular recommendation currently works: it creates one UserRecommendation object and gives that ID to every User in our database.
2 changes: 1 addition & 1 deletion app/main/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def create_app(config_name):
    """Build and configure the Flask application.

    Args:
        config_name: key into ``config_by_name`` selecting the configuration
            object to load into the app.

    Returns:
        The configured Flask application instance.
    """
    app = Flask(__name__)
    # Set up the database exactly once; the earlier `DB.init()` spelling was
    # left behind next to this call and has been dropped.
    # NOTE(review): a static method named `__init__` is easy to confuse with
    # the instance initialiser — confirm the intended name on DB.
    DB.__init__()
    app.config.from_object(config_by_name[config_name])
    flask_bcrypt.init_app(app)
    return app
2 changes: 1 addition & 1 deletion app/main/controller/user_recommendation_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get(self):

@api.route('/v1')
class Recommendation(Resource):

@api.doc('Gets a specific user\'s product recommendations')
def get(self):
user_id = int(request.args.get('userId', None))
Expand Down
4 changes: 3 additions & 1 deletion app/main/dataModel/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@ class Category(object):
COLLECTION = "categories"

def __init__(self, id, name):
    """Create a category stamped with the current time.

    Args:
        id: identifier of the category.
        name: name of the category.
    """
    self.id = id
    self.name = name
    # A freshly created record has not been modified yet, so both timestamps
    # start out as the same value.
    self.added_on = time.time()
    self.last_updated = self.added_on

# adds the categorical info if not found in the database.
def insert(self):
    """Store this category unless a document with the same ``_id`` exists."""
    already_stored = DB.find_one(Category.COLLECTION, {"_id": self.id})
    if already_stored:
        # Nothing to do: the category is already in the collection.
        return
    DB.insert(collection=Category.COLLECTION, data=self.json())

# return the info in json format.
def json(self):
return {
'_id': self.id,
Expand Down
2 changes: 1 addition & 1 deletion app/main/dataModel/merchant_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Merchant_Product(object):
COLLECTION = "merchant_products"

def __init__(self, id, merchant_id, product_id, price, currency, discounted_price):
self.id = id;
self.id = id
self.merchant_id = merchant_id
self.product_id = product_id
self.price = price
Expand Down
8 changes: 7 additions & 1 deletion app/main/dataModel/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,15 @@ def json(self):
'recommendation_id': self.recommendation_id,
'city_id': self.city_id
}

# NOTE(review): User had no insert() before; it is unclear whether that was intentional or an omission, so one is added here.
def insert(self):
    """Store this user unless a document with the same ``_id`` exists."""
    already_stored = DB.find_one(User.COLLECTION, {"_id": self.id})
    if already_stored:
        # Nothing to do: the user is already in the collection.
        return
    DB.insert(collection=User.COLLECTION, data=self.json())

# Returns the id of the user. TODO: find where this function is used and confirm what it should return.
def get_id(self):
    """Return this object's ``id`` attribute."""
    # The former `self.getId()` call referred to a method that is not defined
    # anywhere in view and its result was discarded; it has been removed.
    return self.id

def set_id(self, id):
    """Overwrite this object's ``id`` attribute with ``id``."""
    setattr(self, "id", id)
6 changes: 3 additions & 3 deletions app/main/dataModel/user_purchase.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def insert(self):
def json(self):
    """Return this purchase record as a plain dict.

    Returns:
        dict: the record's id (under ``_id``), user id, product id, purchase
        count and both timestamps.
    """
    # The earlier entries that read `self.name`, `self.retail_price` and
    # `self.categories` used attributes of a different model and were
    # overridden by the keys below; only the correct mapping is kept.
    return {
        '_id': self.id,
        'user_id': self.user_id,
        'product_id': self.product_id,
        'purchased_count': self.purchased_count,
        'added_on': self.added_on,
        'last_updated': self.last_updated,
    }
3 changes: 2 additions & 1 deletion app/main/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ class DB(object):
DB_NAME = "recommendation"

@staticmethod
def init():
    """Connect to MongoDB and cache the database handle on the class.

    Creates a ``pymongo.MongoClient`` for ``DB.URI`` and stores the database
    named ``DB.DB_NAME`` in ``DB.DATABASE``.
    """
    client = pymongo.MongoClient(DB.URI)
    DB.DATABASE = client[DB.DB_NAME]


# Both spellings appear at call sites, so `DB.__init__()` is kept as an alias.
# NOTE(review): prefer `DB.init()`; a static `__init__` also runs on `DB()`.
__init__ = init

Expand Down
266 changes: 266 additions & 0 deletions app/main/model/vowpal_wabbit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""
This file provides a wrapper to run Vowpal Wabbit from the command line through python.
It is not recommended to use this approach in production, there are python bindings that can be installed from the
repository or pip or the command line can be used. This is merely to demonstrate vw usage in the example notebooks.
"""

import os
from subprocess import run
from tempfile import TemporaryDirectory
import pandas as pd

from reco_utils.common.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
DEFAULT_TIMESTAMP_COL,
DEFAULT_PREDICTION_COL,
)


class VW:
    """Wrapper that drives the ``vw`` command-line tool from pandas DataFrames.

    Train and test examples are written to files in a per-instance temporary
    directory, ``vw`` is invoked through ``subprocess.run``, and the predictions
    it writes are read back and joined onto the input DataFrame.
    """

    # Options removed from the user's parameters before building the training
    # command: handled internally, not supported, or test-only.
    _TRAIN_EXCLUDED = (
        "data",
        "final_regressor",
        "invert_hash",
        "readable_model",
        "t",
        "testonly",
        "i",
        "initial_regressor",
        "link",
    )

    # Options removed from the user's parameters before building the test
    # command: handled internally, not supported, or train-only.
    # "testonly" matches the spelling filtered for training; the historical
    # "test_only" spelling is kept so existing callers are unaffected.
    _TEST_EXCLUDED = (
        "data",
        "f",
        "final_regressor",
        "initial_regressor",
        "test_only",
        "testonly",
        "invert_hash",
        "readable_model",
        "b",
        "bit_precision",
        "holdout_off",
        "c",
        "cache",
        "k",
        "kill_cache",
        "l",
        "learning_rate",
        "l1",
        "l2",
        "initial_t",
        "power_t",
        "decay_learning_rate",
        "q",
        "quadratic",
        "cubic",
        "i",
        "interactions",
        "rank",
        "lrq",
        "lrqdropout",
        "oaa",
    )

    def __init__(
        self,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        **kwargs,
    ):
        """Initialize model parameters

        Args:
            col_user (str): user column name
            col_item (str): item column name
            col_rating (str): rating column name
            col_timestamp (str): timestamp column name
            col_prediction (str): prediction column name
            **kwargs: vw command line parameters (key = parameter, value = value;
                use True if the parameter is just a flag)
        """

        # create temporary files
        self.tempdir = TemporaryDirectory()
        self.train_file = os.path.join(self.tempdir.name, "train.dat")
        self.test_file = os.path.join(self.tempdir.name, "test.dat")
        self.model_file = os.path.join(self.tempdir.name, "vw.model")
        self.prediction_file = os.path.join(self.tempdir.name, "prediction.dat")

        # set DataFrame columns
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_timestamp = col_timestamp
        self.col_prediction = col_prediction

        # logistic mode is on when any parameter value is the string "logistic"
        self.logistic = "logistic" in kwargs.values()
        self.train_cmd = self.parse_train_params(params=kwargs)
        self.test_cmd = self.parse_test_params(params=kwargs)

    @staticmethod
    def to_vw_cmd(params):
        """Convert dictionary of parameters to vw command line.

        Args:
            params (dict): key = parameter, value = value (use True if parameter is just a flag)

        Returns:
            list[str]: vw command line parameters as list of strings
        """

        cmd = ["vw"]
        for k, v in params.items():
            if v is False:
                # don't add parameters with a value == False
                continue

            # single-letter options take one hyphen, long options take two
            cmd.append(f"-{k}" if len(k) == 1 else f"--{k}")
            if v is not True:
                # a value of True marks a bare flag, which takes no argument
                cmd.append(f"{v}")

        return cmd

    def parse_train_params(self, params):
        """Parse input hyper-parameters to build vw train commands

        Args:
            params (dict): key = parameter, value = value (use True if parameter is just a flag)

        Returns:
            list[str]: vw command line parameters as list of strings
        """

        # copy the hyper parameters, leaving out options that are handled
        # internally, not supported, or test only
        train_params = {
            k: v for k, v in params.items() if k not in self._TRAIN_EXCLUDED
        }

        # data file, output model and verbosity are always set by the wrapper
        train_params.update(
            {
                "d": self.train_file,
                "f": self.model_file,
                "quiet": params.get("quiet", True),
            }
        )
        return self.to_vw_cmd(params=train_params)

    def parse_test_params(self, params):
        """Parse input hyper-parameters to build vw test commands

        Args:
            params (dict): key = parameter, value = value (use True if parameter is just a flag)

        Returns:
            list[str]: vw command line parameters as list of strings
        """

        # copy the hyper parameters, leaving out options that are handled
        # internally, not supported, or train only
        test_params = {
            k: v for k, v in params.items() if k not in self._TEST_EXCLUDED
        }

        # data file, input model, prediction file, verbosity and test mode are
        # always set by the wrapper
        test_params.update(
            {
                "d": self.test_file,
                "i": self.model_file,
                "quiet": params.get("quiet", True),
                "p": self.prediction_file,
                "t": True,
            }
        )
        return self.to_vw_cmd(params=test_params)

    def to_vw_file(self, df, train=True):
        """Convert Pandas DataFrame to vw input format file

        Args:
            df (pd.DataFrame): input DataFrame
            train (bool): flag for train mode (or test mode if False)
        """

        output = self.train_file if train else self.test_file
        with open(output, "w") as f:
            # extract columns and create a new dataframe
            # NOTE(review): the "index" lookup below assumes reset_index() yields
            # a column named "index", i.e. df has an unnamed index — confirm.
            tmp = df[[self.col_rating, self.col_user, self.col_item]].reset_index()

            if train:
                # we need to reset the rating type to an integer to simplify the vw formatting
                tmp[self.col_rating] = tmp[self.col_rating].astype("int64")

                # convert rating to binary value: for ratings between 0 and the
                # maximum, round(x / max) is 0 or 1, which maps to -1 or +1
                if self.logistic:
                    max_value = tmp[self.col_rating].max()
                    tmp[self.col_rating] = tmp[self.col_rating].apply(
                        lambda x: 2 * round(x / max_value) - 1
                    )
            else:
                # test examples are written without a label
                tmp[self.col_rating] = ""

            # convert each row to VW input format (https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format)
            # [label] [tag]|[user namespace] [user id feature] |[item namespace] [movie id feature]
            # label is the true rating, tag is a unique id for the example just used to link predictions to truth
            # user and item namespaces separate features to support interaction features through command line options
            f.writelines(
                "{rating} {index}|user {userID} |item {itemID}\n".format(
                    rating=row[self.col_rating],
                    index=row["index"],
                    userID=row[self.col_user],
                    itemID=row[self.col_item],
                )
                for _, row in tmp.iterrows()
            )

    def fit(self, df):
        """Train model

        Args:
            df (pd.DataFrame): input training data

        Raises:
            subprocess.CalledProcessError: if the vw command exits with a non-zero status
        """

        # write dataframe to disk in vw format
        self.to_vw_file(df=df)

        # train model
        run(self.train_cmd, check=True)

    def predict(self, df):
        """Predict results

        Args:
            df (pd.DataFrame): input test data

        Returns:
            pd.DataFrame: df joined with the prediction column read from the vw output

        Raises:
            subprocess.CalledProcessError: if the vw command exits with a non-zero status
        """

        # write dataframe to disk in vw format
        self.to_vw_file(df=df, train=False)

        # generate predictions
        run(self.test_cmd, check=True)

        # read predictions; the second whitespace-separated field is used as the
        # index so that the rows line up with df in the join.
        # sep=r"\s+" replaces the deprecated delim_whitespace=True.
        return df.join(
            pd.read_csv(
                self.prediction_file,
                sep=r"\s+",
                names=[self.col_prediction],
                index_col=1,
            )
        )

    def __del__(self):
        """Remove the temporary directory when the instance is collected."""
        # __del__ also runs when __init__ failed before tempdir was assigned,
        # so look the attribute up defensively.
        tempdir = getattr(self, "tempdir", None)
        if tempdir is not None:
            tempdir.cleanup()
Loading