summary refs log tree commit diff
diff options
context:
space:
mode:
author Unknown <catmiller@cat.local> 2014-03-20 17:04:33 -0400
committer Unknown <catmiller@cat.local> 2014-03-20 17:04:33 -0400
commit c3d5f1f0cac8ea3d1c666375d6acaa2a896bbb15 (patch)
tree dc1935f33c88f06848a162a41087c6d3ae8ad56a
parent 8f8ed8cf3e577fb07cd95c66a65b1aac1fa285d2 (diff)
download pygments-c3d5f1f0cac8ea3d1c666375d6acaa2a896bbb15.tar.gz
Create PigLexer.
-rw-r--r-- AUTHORS 1
-rw-r--r-- pygments/lexers/_mapping.py 1
-rw-r--r-- pygments/lexers/jvm.py 52
-rw-r--r-- tests/examplefiles/test.pig 148
4 files changed, 201 insertions, 1 deletions
diff --git a/AUTHORS b/AUTHORS
index ae77e7fb..fa34ec64 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -97,6 +97,7 @@ Other contributors, listed alphabetically, are:
* Brian McKenna -- F# lexer
* Charles McLaughlin -- Puppet lexer
* Lukas Meuser -- BBCode formatter, Lua lexer
+* Cat Miller -- Pig lexer
* Paul Miller -- LiveScript lexer
* Hong Minhee -- HTTP lexer
* Michael Mior -- Awk lexer
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index 3258c865..43b41c3a 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -236,6 +236,7 @@ LEXERS = {
'Perl6Lexer': ('pygments.lexers.agile', 'Perl6', ('perl6', 'pl6'), ('*.pl', '*.pm', '*.nqp', '*.p6', '*.6pl', '*.p6l', '*.pl6', '*.6pm', '*.p6m', '*.pm6', '*.t'), ('text/x-perl6', 'application/x-perl6')),
'PerlLexer': ('pygments.lexers.agile', 'Perl', ('perl', 'pl'), ('*.pl', '*.pm', '*.t'), ('text/x-perl', 'application/x-perl')),
'PhpLexer': ('pygments.lexers.web', 'PHP', ('php', 'php3', 'php4', 'php5'), ('*.php', '*.php[345]', '*.inc'), ('text/x-php',)),
+ 'PigLexer': ('pygments.lexers.jvm', 'Pig', ('pig',), ('*.pig',), ('text/x-pig',)),
'PikeLexer': ('pygments.lexers.compiled', 'Pike', ('pike',), ('*.pike', '*.pmod'), ('text/x-pike',)),
'PlPgsqlLexer': ('pygments.lexers.sql', 'PL/pgSQL', ('plpgsql',), (), ('text/x-plpgsql',)),
'PostScriptLexer': ('pygments.lexers.other', 'PostScript', ('postscript', 'postscr'), ('*.ps', '*.eps'), ('application/postscript',)),
diff --git a/pygments/lexers/jvm.py b/pygments/lexers/jvm.py
index 5c535142..3360ced3 100644
--- a/pygments/lexers/jvm.py
+++ b/pygments/lexers/jvm.py
@@ -20,7 +20,7 @@ from pygments import unistring as uni
__all__ = ['JavaLexer', 'ScalaLexer', 'GosuLexer', 'GosuTemplateLexer',
'GroovyLexer', 'IokeLexer', 'ClojureLexer', 'KotlinLexer',
- 'XtendLexer', 'AspectJLexer', 'CeylonLexer']
+ 'XtendLexer', 'AspectJLexer', 'CeylonLexer', 'PigLexer']
class JavaLexer(RegexLexer):
@@ -1066,3 +1066,53 @@ class XtendLexer(RegexLexer):
(r'.', String)
],
}
+
+class PigLexer(RegexLexer):
+    """
+    For Pig Latin source code.
+    """
+
+    name = 'Pig'
+    aliases = ['pig']
+    filenames = ['*.pig']
+    mimetypes = ['text/x-pig']
+
+    # IGNORECASE applies to every rule below, so keywords, builtins and
+    # type names match in any letter case.
+    flags = re.MULTILINE | re.IGNORECASE
+
+    tokens = {
+        'root': [
+            (r'\s+', Text),
+            # Single-line comment: "--" up to the end of the line.
+            (r'--.*', Comment),
+            # Block comment: any /* ... */ span, possibly over several
+            # lines.  Non-greedy so that it stops at the first "*/" and
+            # may contain "/" or "*" characters in its body.
+            (r'/\*[\w\W]*?\*/', Comment.Multiline),
+            # Backslash line continuation, then any other lone backslash.
+            (r'\\\n', Text),
+            (r'\\', Text),
+            # Quoted strings on a single line.  Empty strings and
+            # backslash-escaped characters (including an escaped quote)
+            # are accepted.
+            (r"'(?:\\.|[^'\\\n])*'", String),
+            (r'"(?:\\.|[^"\\\n])*"', String),
+            include('keywords'),
+            include('types'),
+            include('builtins'),
+            # Float must be tried before Integer, otherwise "1.5" would be
+            # split at the dot.  The exponent may carry a sign.
+            (r'[0-9][0-9]*\.[0-9]+([eE][+-]?[0-9]+)?[fd]?', Number.Float),
+            (r'0x[0-9a-fA-F]+', Number.Hex),
+            (r'[0-9]+L?', Number.Integer),
+            # An identifier directly followed by "(" is a function call.
+            (r'([a-zA-Z_][a-zA-Z0-9_]*)(\s*)(\()',
+             bygroups(Name.Function, Text, Punctuation)),
+            (r'[()#:]', Text),
+            (r'[^(:\n#\'\")\s]+', Text),
+            # Last-resort rule: consumes anything the rules above reject
+            # (e.g. a quote that opens a multi-line string) so that no
+            # Error tokens are produced.
+            (r'\S+\s+', Text)
+        ],
+        'keywords': [
+            (r'(assert|and|any|all|arrange|as|asc|bag|by|cache|CASE|cat|'
+             r'cd|cp|%declare|%default|define|dense|desc|describe|'
+             r'distinct|du|dump|eval|exec|explain|filter|flatten|foreach|'
+             r'full|generate|group|help|if|illustrate|import|inner|input|'
+             r'into|is|join|kill|left|limit|load|ls|map|matches|mkdir|mv|'
+             r'not|null|onschema|or|order|outer|output|parallel|pig|pwd|'
+             r'quit|register|returns|right|rm|rmf|rollup|run|sample|set|'
+             r'ship|split|stderr|stdin|stdout|store|stream|through|union|'
+             r'using|void)\b', Keyword)
+        ],
+        'builtins': [
+            (r'(AVG|BinStorage|cogroup|CONCAT|copyFromLocal|copyToLocal|'
+             r'COUNT|cross|DIFF|MAX|MIN|PigDump|PigStorage|SIZE|SUM|'
+             r'TextLoader|TOKENIZE)\b', Name.Builtin)
+        ],
+        'types': [
+            (r'(bytearray|BIGINTEGER|BIGDECIMAL|chararray|datetime|double|'
+             r'float|int|long|tuple)\b', Keyword.Type)
+        ],
+    }
diff --git a/tests/examplefiles/test.pig b/tests/examplefiles/test.pig
new file mode 100644
index 00000000..f67b0268
--- /dev/null
+++ b/tests/examplefiles/test.pig
@@ -0,0 +1,148 @@
+/**
+ * This script is an example recommender (using made up data) showing how you might modify item-item links
+ * by defining similar relations between items in a dataset and customizing the change in weighting.
+ * This example creates metadata by using the genre field as the metadata_field. The items with
+ * the same genre have their weight cut in half in order to boost the signals of movies that do not have the same genre.
+ * This technique requires a customization of the standard GetItemItemRecommendations macro
+ */
+import 'recommenders.pig';
+
+
+
+%default INPUT_PATH_PURCHASES '../data/retail/purchases.json'
+%default INPUT_PATH_WISHLIST '../data/retail/wishlists.json'
+%default INPUT_PATH_INVENTORY '../data/retail/inventory.json'
+%default OUTPUT_PATH '../data/retail/out/modify_item_item'
+
+
+/******** Custom GetItemItemRecommendations *********/
+define recsys__GetItemItemRecommendations_ModifyCustom(user_item_signals, metadata) returns item_item_recs {
+
+ -- Convert user_item_signals to an item_item_graph
+ ii_links_raw, item_weights = recsys__BuildItemItemGraph(
+ $user_item_signals,
+ $LOGISTIC_PARAM,
+ $MIN_LINK_WEIGHT,
+ $MAX_LINKS_PER_USER
+ );
+ -- NOTE this function is added in order to combine metadata with item-item links
+ -- See macro for more detailed explanation
+ ii_links_metadata = recsys__AddMetadataToItemItemLinks(
+ ii_links_raw,
+ $metadata
+ );
+
+ /********* Custom Code starts here ********/
+
+ --The code here should adjust the weights based on an item-item link and the equality of metadata.
+ -- In this case, if the metadata is the same, the weight is reduced. Otherwise the weight is left alone.
+ ii_links_adjusted = foreach ii_links_metadata generate item_A, item_B,
+ -- the amount of weight adjusted is dependent on the domain of data and what is expected
+ -- It is always best to adjust the weight by multiplying it by a factor rather than addition with a constant
+ (metadata_B == metadata_A ? (weight * 0.5): weight) as weight;
+
+
+ /******** Custom Code stops here *********/
+
+ -- remove negative numbers just in case
+ ii_links_adjusted_filt = foreach ii_links_adjusted generate item_A, item_B,
+ (weight <= 0 ? 0: weight) as weight;
+ -- Adjust the weights of the graph to improve recommendations.
+ ii_links = recsys__AdjustItemItemGraphWeight(
+ ii_links_adjusted_filt,
+ item_weights,
+ $BAYESIAN_PRIOR
+ );
+
+ -- Use the item-item graph to create item-item recommendations.
+ $item_item_recs = recsys__BuildItemItemRecommendationsFromGraph(
+ ii_links,
+ $NUM_RECS_PER_ITEM,
+ $NUM_RECS_PER_ITEM
+ );
+};
+
+
+/******* Load Data **********/
+
+--Get purchase signals
+purchase_input = load '$INPUT_PATH_PURCHASES' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'row_id: int,
+ movie_id: chararray,
+ movie_name: chararray,
+ user_id: chararray,
+ purchase_price: int');
+
+--Get wishlist signals
+wishlist_input = load '$INPUT_PATH_WISHLIST' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'row_id: int,
+ movie_id: chararray,
+ movie_name: chararray,
+ user_id: chararray');
+
+
+/******* Convert Data to Signals **********/
+
+-- Start with choosing 1 as max weight for a signal.
+purchase_signals = foreach purchase_input generate
+ user_id as user,
+ movie_name as item,
+ 1.0 as weight;
+
+
+-- Start with choosing 0.5 as weight for wishlist items because that is a weaker signal than
+-- purchasing an item.
+wishlist_signals = foreach wishlist_input generate
+ user_id as user,
+ movie_name as item,
+ 0.5 as weight;
+
+user_signals = union purchase_signals, wishlist_signals;
+
+
+/******** Changes for Modifying item-item links ******/
+inventory_input = load '$INPUT_PATH_INVENTORY' using org.apache.pig.piggybank.storage.JsonLoader(
+ 'movie_title: chararray,
+ genres: bag{tuple(content:chararray)}');
+
+
+metadata = foreach inventory_input generate
+ FLATTEN(genres) as metadata_field,
+ movie_title as item;
+-- requires the macro to be written separately
+ --NOTE this macro is defined within this file for clarity
+item_item_recs = recsys__GetItemItemRecommendations_ModifyCustom(user_signals, metadata);
+/******* No more changes ********/
+
+
+user_item_recs = recsys__GetUserItemRecommendations(user_signals, item_item_recs);
+
+--Completely unrelated code stuck in the middle
+data = LOAD 's3n://my-s3-bucket/path/to/responses'
+ USING org.apache.pig.piggybank.storage.JsonLoader();
+responses = FOREACH data GENERATE object#'response' AS response: map[];
+out = FOREACH responses
+ GENERATE response#'id' AS id: int, response#'thread' AS thread: chararray,
+ response#'comments' AS comments: {t: (comment: chararray)};
+STORE out INTO 's3n://path/to/output' USING PigStorage('|');
+
+
+/******* Store recommendations **********/
+
+-- If your output folder exists already, hadoop will refuse to write data to it.
+
+rmf $OUTPUT_PATH/item_item_recs;
+rmf $OUTPUT_PATH/user_item_recs;
+
+store item_item_recs into '$OUTPUT_PATH/item_item_recs' using PigStorage();
+store user_item_recs into '$OUTPUT_PATH/user_item_recs' using PigStorage();
+
+-- STORE the item_item_recs into dynamo
+STORE item_item_recs
+ INTO '$OUTPUT_PATH/unused-ii-table-data'
+USING com.mortardata.pig.storage.DynamoDBStorage('$II_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');
+
+-- STORE the user_item_recs into dynamo
+STORE user_item_recs
+ INTO '$OUTPUT_PATH/unused-ui-table-data'
+USING com.mortardata.pig.storage.DynamoDBStorage('$UI_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');