summaryrefslogtreecommitdiff
path: root/bzrlib/tests/EncodingAdapter.py
diff options
context:
space:
mode:
Diffstat (limited to 'bzrlib/tests/EncodingAdapter.py')
-rw-r--r--bzrlib/tests/EncodingAdapter.py133
1 files changed, 133 insertions, 0 deletions
diff --git a/bzrlib/tests/EncodingAdapter.py b/bzrlib/tests/EncodingAdapter.py
new file mode 100644
index 0000000..a5d4470
--- /dev/null
+++ b/bzrlib/tests/EncodingAdapter.py
@@ -0,0 +1,133 @@
+# Copyright (C) 2006, 2009, 2010, 2011 Canonical Ltd
+# -*- coding: utf-8 -*-
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Adapter for running test cases against multiple encodings."""
+
+# prefix for micro (1/1000000)
+_mu = u'\xb5'
+
+# greek letter omega, not to be confused with
+# the Ohm sign, u'\u2126'. Though they are probably identical
+# cp437 can handle the first, but not the second
+_omega = u'\u03a9'
+
+# smallest error possible, epsilon
+# cp437 handles u03b5, but not u2208 the 'element of' operator
+_epsilon = u'\u03b5'
+
+# Swedish?
+_erik = u'Erik B\xe5gfors'
+
+# Swedish 'räksmörgås' means shrimp sandwich
+_shrimp_sandwich = u'r\xe4ksm\xf6rg\xe5s'
+
+# Arabic, probably only Unicode encodings can handle this one
+_juju = u'\u062c\u0648\u062c\u0648'
+
+# iso-8859-1 alternative for juju
+_juju_alt = u'j\xfbj\xfa'
+
+# Russian, 'Alexander' in russian
+_alexander = u'\u0410\u043b\u0435\u043a\u0441\u0430\u043d\u0434\u0440'
+# The word 'test' in Russian
+_russian_test = u'\u0422\u0435\u0441\u0442'
+
+# Kanji
+# It is a kanji sequence for nihonjin, or Japanese in English.
+#
+# '\u4eba' being person, 'u\65e5' sun and '\u672c' origin. Ie,
+# sun-origin-person, 'native from the land where the sun rises'. Note, I'm
+# not a fluent speaker, so this is just my crude breakdown.
+#
+# Wouter van Heyst
+_nihonjin = u'\u65e5\u672c\u4eba'
+
+# Czech
+# It's what is usually used for showing how fonts look, because it contains
+# most accented characters, ie. in places where Englishman use 'Quick brown fox
+# jumped over a lazy dog'. The literal translation of the Czech version would
+# be something like 'Yellow horse groaned devilish codes'. Actually originally
+# the last word used to be 'ódy' (odes). The 'k' was added as a pun when using
+# the sentece to check whether one has properly set encoding.
+_yellow_horse = (u'\u017dlu\u0165ou\u010dk\xfd k\u016f\u0148'
+ u' \xfap\u011bl \u010f\xe1belsk\xe9 k\xf3dy')
+_yellow = u'\u017dlu\u0165ou\u010dk\xfd'
+_someone = u'Some\u016f\u0148\u011b'
+_something = u'\u0165ou\u010dk\xfd'
+
+# Hebrew
+# Shalom -> 'hello' or 'peace', used as a common greeting
+_shalom = u'\u05e9\u05dc\u05d5\u05dd'
+
+
+encoding_scenarios = [
+ # Permutation 1 of utf-8
+ ('utf-8,1', {
+ 'info': {
+ 'committer': _erik,
+ 'message': _yellow_horse,
+ 'filename': _shrimp_sandwich,
+ 'directory': _nihonjin,
+ },
+ 'encoding': 'utf-8',
+ }),
+ # Permutation 2 of utf-8
+ ('utf-8,2', {
+ 'info': {
+ 'committer': _alexander,
+ 'message': u'Testing ' + _mu,
+ 'filename': _shalom,
+ 'directory': _juju,
+ },
+ 'encoding': 'utf-8',
+ }),
+ ('iso-8859-1', {
+ 'info': {
+ 'committer': _erik,
+ 'message': u'Testing ' + _mu,
+ 'filename': _juju_alt,
+ 'directory': _shrimp_sandwich,
+ },
+ 'encoding': 'iso-8859-1',
+ }),
+ ('iso-8859-2', {
+ 'info': {
+ 'committer': _someone,
+ 'message': _yellow_horse,
+ 'filename': _yellow,
+ 'directory': _something,
+ },
+ 'encoding': 'iso-8859-2',
+ }),
+ ('cp1251', {
+ 'info': {
+ 'committer': _alexander,
+ 'message': u'Testing ' + _mu,
+ 'filename': _russian_test,
+ 'directory': _russian_test + 'dir',
+ },
+ 'encoding': 'cp1251',
+ }),
+# The iso-8859-1 tests run on a default windows cp437 installation
+# and it takes a long time to run an extra permutation of the tests
+# But just in case we want to add this back in:
+# ('cp437', {'committer':_erik
+# , 'message':u'Testing ' + _mu
+# , 'filename':'file_' + _omega
+# , 'directory':_epsilon + '_dir',
+# 'encoding': 'cp437'}),
+ ]