summaryrefslogtreecommitdiff
path: root/tests/examplefiles/test.pig
diff options
context:
space:
mode:
Diffstat (limited to 'tests/examplefiles/test.pig')
-rw-r--r--tests/examplefiles/test.pig148
1 files changed, 148 insertions, 0 deletions
diff --git a/tests/examplefiles/test.pig b/tests/examplefiles/test.pig
new file mode 100644
index 00000000..f67b0268
--- /dev/null
+++ b/tests/examplefiles/test.pig
@@ -0,0 +1,148 @@
+/**
+ * This script is an example recommender (using made up data) showing how you might modify item-item links
+ * by defining similar relations between items in a dataset and customizing the change in weighting.
+ * This example creates metadata by using the genre field as the metadata_field. The items with
+ * the same genre have it's weight cut in half in order to boost the signals of movies that do not have the same genre.
+ * This technique requires a customization of the standard GetItemItemRecommendations macro
+ */
+import 'recommenders.pig';
+
+
+
+%default INPUT_PATH_PURCHASES '../data/retail/purchases.json'
+%default INPUT_PATH_WISHLIST '../data/retail/wishlists.json'
+%default INPUT_PATH_INVENTORY '../data/retail/inventory.json'
+%default OUTPUT_PATH '../data/retail/out/modify_item_item'
+
+
+/******** Custom GetItemItemRecommnedations *********/
+define recsys__GetItemItemRecommendations_ModifyCustom(user_item_signals, metadata) returns item_item_recs {
+
+ -- Convert user_item_signals to an item_item_graph
+ ii_links_raw, item_weights = recsys__BuildItemItemGraph(
+ $user_item_signals,
+ $LOGISTIC_PARAM,
+ $MIN_LINK_WEIGHT,
+ $MAX_LINKS_PER_USER
+ );
+ -- NOTE this function is added in order to combine metadata with item-item links
+ -- See macro for more detailed explination
+ ii_links_metadata = recsys__AddMetadataToItemItemLinks(
+ ii_links_raw,
+ $metadata
+ );
+
+ /********* Custom Code starts here ********/
+
+ --The code here should adjust the weights based on an item-item link and the equality of metadata.
+ -- In this case, if the metadata is the same, the weight is reduced. Otherwise the weight is left alone.
+ ii_links_adjusted = foreach ii_links_metadata generate item_A, item_B,
+ -- the amount of weight adjusted is dependant on the domain of data and what is expected
+ -- It is always best to adjust the weight by multiplying it by a factor rather than addition with a constant
+ (metadata_B == metadata_A ? (weight * 0.5): weight) as weight;
+
+
+ /******** Custom Code stops here *********/
+
+ -- remove negative numbers just incase
+ ii_links_adjusted_filt = foreach ii_links_adjusted generate item_A, item_B,
+ (weight <= 0 ? 0: weight) as weight;
+ -- Adjust the weights of the graph to improve recommendations.
+ ii_links = recsys__AdjustItemItemGraphWeight(
+ ii_links_adjusted_filt,
+ item_weights,
+ $BAYESIAN_PRIOR
+ );
+
+ -- Use the item-item graph to create item-item recommendations.
+ $item_item_recs = recsys__BuildItemItemRecommendationsFromGraph(
+ ii_links,
+ $NUM_RECS_PER_ITEM,
+ $NUM_RECS_PER_ITEM
+ );
+};
+
+
+/******* Load Data **********/
+
+--Get purchase signals
+purchase_input = load '$INPUT_PATH_PURCHASES' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'row_id: int,
+ movie_id: chararray,
+ movie_name: chararray,
+ user_id: chararray,
+ purchase_price: int');
+
+--Get wishlist signals
+wishlist_input = load '$INPUT_PATH_WISHLIST' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'row_id: int,
+ movie_id: chararray,
+ movie_name: chararray,
+ user_id: chararray');
+
+
+/******* Convert Data to Signals **********/
+
+-- Start with choosing 1 as max weight for a signal.
+purchase_signals = foreach purchase_input generate
+ user_id as user,
+ movie_name as item,
+ 1.0 as weight;
+
+
+-- Start with choosing 0.5 as weight for wishlist items because that is a weaker signal than
+-- purchasing an item.
+wishlist_signals = foreach wishlist_input generate
+ user_id as user,
+ movie_name as item,
+ 0.5 as weight;
+
+user_signals = union purchase_signals, wishlist_signals;
+
+
+/******** Changes for Modifying item-item links ******/
+inventory_input = load '$INPUT_PATH_INVENTORY' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'movie_title: chararray,
+ genres: bag{tuple(content:chararray)}');
+
+
+metadata = foreach inventory_input generate
+ FLATTEN(genres) as metadata_field,
+ movie_title as item;
+-- requires the macro to be written seperately
+ --NOTE this macro is defined within this file for clarity
+item_item_recs = recsys__GetItemItemRecommendations_ModifyCustom(user_signals, metadata);
+/******* No more changes ********/
+
+
+user_item_recs = recsys__GetUserItemRecommendations(user_signals, item_item_recs);
+
+--Completely unrelated code stuck in the middle
+data = LOAD 's3n://my-s3-bucket/path/to/responses'
+ USING org.apache.pig.piggybank.storage.JsonLoader();
+responses = FOREACH data GENERATE object#'response' AS response: map[];
+out = FOREACH responses
+ GENERATE response#'id' AS id: int, response#'thread' AS thread: chararray,
+ response#'comments' AS comments: {t: (comment: chararray)};
+STORE out INTO 's3n://path/to/output' USING PigStorage('|');
+
+
+/******* Store recommendations **********/
+
+-- If your output folder exists already, hadoop will refuse to write data to it.
+
+rmf $OUTPUT_PATH/item_item_recs;
+rmf $OUTPUT_PATH/user_item_recs;
+
+store item_item_recs into '$OUTPUT_PATH/item_item_recs' using PigStorage();
+store user_item_recs into '$OUTPUT_PATH/user_item_recs' using PigStorage();
+
+-- STORE the item_item_recs into dynamo
+STORE item_item_recs
+ INTO '$OUTPUT_PATH/unused-ii-table-data'
+USING com.mortardata.pig.storage.DynamoDBStorage('$II_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');
+
+-- STORE the user_item_recs into dynamo
+STORE user_item_recs
+ INTO '$OUTPUT_PATH/unused-ui-table-data'
+USING com.mortardata.pig.storage.DynamoDBStorage('$UI_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');