Fix Unicode object -> chararray conversion on narrow Python builds

author: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
committer: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
commit: fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch)
tree: 87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core
parent: c4db9cf34c643bd422060de6b636f53a57557c4d (diff)
download: numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz
4 files changed, 48 insertions, 9 deletions
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 45b061d69..4ad5c59d3 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -1714,6 +1714,7 @@ class chararray(ndarray):
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    order=order)
         else:
+            print shape, dtype, itemsize
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    buffer=buffer,
                                    offset=offset, strides=strides,
@@ -2422,9 +2423,40 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
                 unicode = True
             else:
                 unicode = False
+
         if itemsize is None:
             itemsize = _len(obj)
         shape = _len(obj) / itemsize
+
+        if unicode:
+            if sys.maxunicode == 0xffff:
+                # On a narrow Python build, the buffer for Unicode
+                # strings is UCS2, which doesn't match the buffer for
+                # Numpy Unicode types, which is ALWAYS UCS4.
+                # Therefore, we need to convert the buffer.  On Python
+                # 2.6 and later, we can use the utf_32 codec.  Earlier
+                # versions don't have that codec, so we convert to a
+                # numerical array that matches the input buffer, and
+                # then use Numpy to convert it to UCS4.  All of this
+                # should happen in native endianness.
+                if sys.hexversion >= 0x2060000:
+                    obj = obj.encode('utf_32')
+                else:
+                    if isinstance(obj, str):
+                        ascii = numpy.frombuffer(obj, 'u1')
+                        ucs4 = numpy.array(ascii, 'u4')
+                        obj = ucs4.data
+                    else:
+                        ucs2 = numpy.frombuffer(obj, 'u2')
+                        ucs4 = numpy.array(ucs2, 'u4')
+                        obj = ucs4.data
+            else:
+                obj = unicode(obj)
+        else:
+            # Let the default Unicode -> string encoding (if any) take
+            # precedence.
+            obj = str(obj)
+
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                          buffer=obj, order=order)
 
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 9cff6836e..b2b73d7be 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -1816,9 +1816,10 @@ OBJECT_copyswapn (PyObject **dst, intp dstride, PyObject **src, intp sstride,
 {
     intp i;
     if (src != NULL) {
-        dstride /= sizeof(PyObject **);
-        sstride /= sizeof(PyObject **);
-        if (__ALIGNED(dst,sizeof(PyObject **)) && __ALIGNED(src, sizeof(PyObject **))) {
+        if (__ALIGNED(dst,sizeof(PyObject **)) && __ALIGNED(src, sizeof(PyObject **)) &&
+            __ALIGNED(dstride,sizeof(PyObject **)) && __ALIGNED(sstride,sizeof(PyObject*))) {
+            dstride /= sizeof(PyObject **);
+            sstride /= sizeof(PyObject **);
             for (i=0; i<n; i++) {
                 Py_XINCREF(*src);
                 Py_XDECREF(*dst);
@@ -1828,10 +1829,13 @@ OBJECT_copyswapn (PyObject **dst, intp dstride, PyObject **src, intp sstride,
             }
         }
         else {
+            unsigned char *dstp, *srcp;
             PyObject **dp, **sp;
+            dstp = (unsigned char*)dst;
+            srcp = (unsigned char*)src;
             for (i=0; i<n; i++) {
-                dp = dst;
-                sp = src;
+                dp = (PyObject **)dstp;
+                sp = (PyObject **)srcp;
                 Py_XINCREF(*sp);
                 Py_XDECREF(*dp);
                 memcpy(dst, src, sizeof(PyObject *));
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index b85cf937d..bf69b8953 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -126,24 +126,25 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
  * buffers[1] is the source
  */
 static void
-_strided_buffered_cast(char *dptr, intp dstride, int delsize, int dswap,
+_strided_buffered_cast(char *dptr, intp dstride, intp delsize, int dswap,
                        PyArray_CopySwapNFunc *dcopyfunc,
-                       char *sptr, intp sstride, int selsize, int sswap,
+                       char *sptr, intp sstride, intp selsize, int sswap,
                        PyArray_CopySwapNFunc *scopyfunc,
                        intp N, char **buffers, int bufsize,
                        PyArray_VectorUnaryFunc *castfunc,
                        PyArrayObject *dest, PyArrayObject *src)
 {
     int i;
+
     if (N <= bufsize) {
         /*
          * 1. copy input to buffer and swap
          * 2. cast input to output
          * 3. swap output if necessary and copy from output buffer
          */
-        scopyfunc(buffers[1], selsize, sptr, sstride, N, sswap, src);
+        scopyfunc((void *)buffers[1], selsize, sptr, sstride, N, sswap, src);
         castfunc(buffers[1], buffers[0], N, src, dest);
-        dcopyfunc(dptr, dstride, buffers[0], delsize, N, dswap, dest);
+        dcopyfunc(dptr, dstride, (void *)buffers[0], delsize, N, dswap, dest);
         return;
     }
 
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index fa3fe982f..a2e04b632 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -66,6 +66,8 @@ class TestBasic(TestCase):
 
     def test_from_unicode(self):
         A = np.char.array(u'\u03a3')
+        print A
+        print repr(A)
         assert_equal(len(A), 1)
         assert_equal(len(A[0]), 1)
         assert_equal(A.itemsize, 4)
author	mdroe <mdroe@localhost>	2009-10-14 15:01:41 +0000
committer	mdroe <mdroe@localhost>	2009-10-14 15:01:41 +0000
commit	fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch)
tree	87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core
parent	c4db9cf34c643bd422060de6b636f53a57557c4d (diff)
download	numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz