1 files changed, 128 insertions, 0 deletions
diff --git a/src/examples/btpyparse.py b/src/examples/btpyparse.py
new file mode 100644
index 0000000..9700ff2
--- /dev/null
+++ b/src/examples/btpyparse.py
@@ -0,0 +1,128 @@
+""" Pyparsing parser for BibTeX files
+
+A standalone parser using pyparsing.
+
+pyparsing has a simple and expressive syntax so the grammar is easy to read and
+write.
+
+Matthew Brett 2010
+Simplified BSD license
+"""
+
+from pyparsing import (Regex, Suppress, ZeroOrMore, Group, Optional, Forward,
+                       SkipTo, CaselessLiteral, Dict)
+
+
+class Macro(object):
+    """ Class to encapsulate undefined macro references """
+    def __init__(self, name):
+        self.name = name
+    def __repr__(self):
+        return 'Macro("%s")' % self.name
+    def __eq__(self, other):
+        return self.name == other.name
+    def __ne__(self, other):
+        return self.name != other.name
+
+
+# Character literals
+LCURLY,RCURLY,LPAREN,RPAREN,QUOTE,COMMA,AT,EQUALS,HASH = map(Suppress,'{}()",@=#')
+
+
+def bracketed(expr):
+    """ Return matcher for `expr` between curly brackets or parentheses """
+    return (LPAREN + expr + RPAREN) | (LCURLY + expr + RCURLY)
+
+
+# Define parser components for strings (the hard bit)
+chars_no_curly = Regex(r"[^{}]+")
+chars_no_curly.leaveWhitespace()
+chars_no_quotecurly = Regex(r'[^"{}]+')
+chars_no_quotecurly.leaveWhitespace()
+# Curly string is some stuff without curlies, or nested curly sequences
+curly_string = Forward()
+curly_item = Group(curly_string) | chars_no_curly
+curly_string << LCURLY + ZeroOrMore(curly_item) + RCURLY
+# quoted string is either just stuff within quotes, or stuff within quotes, within
+# which there is nested curliness
+quoted_item = Group(curly_string) | chars_no_quotecurly
+quoted_string = QUOTE + ZeroOrMore(quoted_item) + QUOTE
+
+# Numbers can just be numbers. Only integers though.
+number = Regex('[0-9]+')
+
+# Basis characters (by exclusion) for variable / field names.  The following
+# list of characters is from the btparse documentation
+any_name = Regex('[^\s"#%\'(),={}]+')
+
+# btparse says, and the test bibs show by experiment, that macro and field names
+# cannot start with a digit.  In fact entry type names cannot start with a digit
+# either (see tests/bibs). Cite keys can start with a digit
+not_digname = Regex('[^\d\s"#%\'(),={}][^\s"#%\'(),={}]*')
+
+# Comment comments out to end of line
+comment = (AT + CaselessLiteral('comment') +
+           Regex("[\s{(].*").leaveWhitespace())
+
+# The name types with their digiteyness
+not_dig_lower = not_digname.copy().setParseAction(lambda t: t[0].lower())
+macro_def = not_dig_lower.copy()
+macro_ref = not_dig_lower.copy().setParseAction(lambda t : Macro(t[0].lower()))
+field_name = not_dig_lower.copy()
+# Spaces in names mean they cannot clash with field names
+entry_type = not_dig_lower('entry_type')
+cite_key = any_name('cite_key')
+# Number has to be before macro name
+string = (number | macro_ref | quoted_string | curly_string)
+
+# There can be hash concatenation
+field_value = string + ZeroOrMore(HASH + string)
+field_def = Group(field_name + EQUALS + field_value)
+entry_contents = Dict(ZeroOrMore(field_def + COMMA) + Optional(field_def))
+
+# Entry is surrounded either by parentheses or curlies
+entry = (AT + entry_type + bracketed(cite_key + COMMA + entry_contents))
+
+# Preamble is a macro-like thing with no name
+preamble = AT + CaselessLiteral('preamble') + bracketed(field_value)
+
+# Macros (aka strings)
+macro_contents = macro_def + EQUALS + field_value
+macro = AT + CaselessLiteral('string') + bracketed(macro_contents)
+
+# Implicit comments
+icomment = SkipTo('@').setParseAction(lambda t : t.insert(0, 'icomment'))
+
+# entries are last in the list (other than the fallback) because they have
+# arbitrary start patterns that would match comments, preamble or macro
+definitions = Group(comment |
+                    preamble |
+                    macro |
+                    entry |
+                    icomment)
+
+# Start symbol
+bibfile = ZeroOrMore(definitions)
+
+
+def parse_str(str):
+    return bibfile.parseString(str)
+
+
+if __name__ == '__main__':
+    # Run basic test
+    txt = """
+Some introductory text
+(implicit comment)
+
+@ARTICLE{Authors2011,
+  author = {First Author and Second Author and Third Author},
+  title = {An article about {S}omething},
+  journal = "Journal of Articles",
+  year = {2011},
+  volume = {16},
+  pages = {1140--1141},
+  number = {2}
+}
+"""
+    print '\n\n'.join(defn.dump() for defn in parse_str(txt))