diff options
author | José Valim <jose.valim@plataformatec.com.br> | 2017-05-25 19:54:04 +0200 |
---|---|---|
committer | José Valim <jose.valim@plataformatec.com.br> | 2017-05-26 16:50:57 +0200 |
commit | b41ddd960ae7301815174b1a0193a9f6465bf787 (patch) | |
tree | 514f8acecfa7bb288eaaa2e5022487bf9a1334f0 | |
parent | bb274ee00540d50a899098fd5c9b0900ac1aa631 (diff) | |
download | elixir-b41ddd960ae7301815174b1a0193a9f6465bf787.tar.gz |
Add Unicode Syntax specification
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | lib/elixir/pages/Unicode Syntax.md | 58 |
2 files changed, 59 insertions, 1 deletions
@@ -147,7 +147,7 @@ docs: compile ../ex_doc/bin/ex_doc docs_elixir docs_eex docs_mix docs_iex docs_e docs_elixir: compile ../ex_doc/bin/ex_doc @ echo "==> ex_doc (elixir)" $(Q) rm -rf doc/elixir - $(call COMPILE_DOCS,Elixir,elixir,Kernel,-e "lib/elixir/pages/Behaviours.md" -e "lib/elixir/pages/Deprecations.md" -e "lib/elixir/pages/Guards.md" -e "lib/elixir/pages/Naming Conventions.md" -e "lib/elixir/pages/Operators.md" -e "lib/elixir/pages/Syntax Reference.md" -e "lib/elixir/pages/Typespecs.md" -e "lib/elixir/pages/Writing Documentation.md") + $(call COMPILE_DOCS,Elixir,elixir,Kernel,-e "lib/elixir/pages/Behaviours.md" -e "lib/elixir/pages/Deprecations.md" -e "lib/elixir/pages/Guards.md" -e "lib/elixir/pages/Naming Conventions.md" -e "lib/elixir/pages/Operators.md" -e "lib/elixir/pages/Syntax Reference.md" -e "lib/elixir/pages/Typespecs.md" -e "lib/elixir/pages/Unicode Syntax.md" -e "lib/elixir/pages/Writing Documentation.md") docs_eex: compile ../ex_doc/bin/ex_doc @ echo "==> ex_doc (eex)" diff --git a/lib/elixir/pages/Unicode Syntax.md b/lib/elixir/pages/Unicode Syntax.md new file mode 100644 index 000000000..ff5e7496e --- /dev/null +++ b/lib/elixir/pages/Unicode Syntax.md @@ -0,0 +1,58 @@ +# Unicode Syntax + +Elixir implements [Unicode Annex #31](http://unicode.org/reports/tr31/) for non-quoted atoms and variables as specified in this document. + +## Version + +To check the Unicode version of your current Elixir installation please run `String.Unicode.version()`. + +The changes in this document were included in Elixir v1.5 and require OTP 20+. + +## R1. Default Identifiers + +Elixir identifiers are identified as: + + <Identifier> := <Start> <Continue>* <Ending>? 
+ +where `<Start>` is: + +> characters derived from the Unicode General Category of uppercase letters, lowercase letters, titlecase letters, modifier letters, other letters, letter numbers, plus Other_ID_Start, minus Pattern_Syntax and Pattern_White_Space code points +> +> In set notation: [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] + +and `<Continue>` is: + +> ID_Start characters, plus characters having the Unicode General Category of nonspacing marks, spacing combining marks, decimal number, connector punctuation, plus Other_ID_Continue, minus Pattern_Syntax and Pattern_White_Space code points. +> +> In set notation: [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] + +`<Ending>` is an addition specific to Elixir that includes the codepoints ? (003F) and ! (0021). + +Elixir does not implement requirement R1a. It does implement requirement R1b. + +### Atoms + +Atoms in Elixir follow the identifier rule above with the following modifications: + + * `<Start>` includes the codepoint _ (005F) + * `<Continue>` includes the codepoint @ (0040) + +### Variables + +Variables in Elixir follow the identifier rule above with the following modifications: + + * `<Start>` includes the codepoint _ (005F) + * `<Start>` must not include Lu (letter uppercase) and Lt (letter titlecase) characters + * `<Continue>` includes Lu (letter uppercase) and Lt (letter titlecase) characters + +## R6. Filtered Normalized Identifiers + +Identifiers in Elixir are case sensitive. + +Elixir requires all atoms and variables to be in NFC form. Any other form will fail with a relevant error message. Quoted-atoms and strings can, however, be in any form and are not verified by the parser. + +In other words, the atom `:josé` can only be written with the codepoints 006A 006F 0073 00E9. On the other hand, `:"josé"` may be written as 006A 006F 0073 00E9 or 006A 006F 0073 0065 0301. 
+ +## Other considerations + +It is worth noting that Elixir supports only codepoints \t (0009), \n (000A), \r (000D) and \s (0020) as whitespace and therefore does not follow requirement R3. |