bzrlib/_knit_load_data_pyx.pyx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304

# Copyright (C) 2007-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Pyrex extensions to knit parsing."""

import sys

from bzrlib import errors


cdef extern from "stdlib.h":
    # C standard library names used further down in this file: size_t is
    # needed for the memchr declaration, strtol by string_to_int_safe.
    ctypedef unsigned size_t
    # Parses an integer in the given base starting at nptr and stores the
    # address of the first unparsed character in *endptr.
    long int strtol(char *nptr, char **endptr, int base)


cdef extern from "Python.h":
    # CPython C-API functions and macros used by the parser below.
    int PyDict_CheckExact(object)
    # PyDict_GetItem under a local name, declared as returning void* so that
    # the result can be compared against NULL when the key is absent.
    void *PyDict_GetItem_void "PyDict_GetItem" (object p, object key)
    # 'except -1': a -1 return value is propagated as a Python exception.
    int PyDict_SetItem(object p, object key, object val) except -1

    int PyList_Append(object lst, object item) except -1
    # Declared as returning 'object'; the caller in process_parents follows
    # each call with an explicit Py_INCREF on the result.
    object PyList_GET_ITEM(object lst, int index)
    int PyList_CheckExact(object)

    # PyTuple_GET_ITEM under a local name that takes and returns void*, so
    # it can be applied directly to the void* obtained from
    # PyDict_GetItem_void.
    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)

    # NOTE(review): these three are declared without an exception clause, so
    # an error return from them is not turned into a Python exception at the
    # call site.
    char *PyString_AsString(object p)
    object PyString_FromStringAndSize(char *, int)
    int PyString_Size(object p)

    void Py_INCREF(object)


cdef extern from "string.h":
    # Searches the first n bytes at s for the byte c; the callers below
    # compare the result against NULL to detect "not found".
    void *memchr(void *s, int c, size_t n)


cdef int string_to_int_safe(char *s, char *end, int *out) except -1:
    """Convert a base10 string to an integer.

    This makes sure the whole string is consumed, or it raises ValueError.
    This is similar to how int(s) works, except you don't need a Python
    String object.

    :param s: The string to convert
    :param end: The character after the integer. So if the string is '12\0',
        this should be pointing at the '\0'. If the string was '12 ' then this
        should point at the ' '.
    :param out: This is the integer that will be returned
    :return: -1 if an exception is raised. 0 otherwise
    :raises ValueError: if nothing could be converted (for example the field
        between s and end is empty), or if the conversion did not stop
        exactly at end.
    """
    cdef char *integer_end

    # We can't just return the integer because of how pyrex determines when
    # there is an exception.
    # NOTE(review): strtol returns a long and the cast to int is unchecked,
    # so a value outside the range of a C int is not reported here.
    out[0] = <int>strtol(s, &integer_end, 10)
    # When strtol cannot convert anything it leaves integer_end == s.  That
    # has to be rejected explicitly: with an empty field (s == end) the
    # "stopped at end" test alone would pass and the field would silently be
    # read as 0, which int('') would never do.
    if integer_end == s or integer_end != end:
        py_s = PyString_FromStringAndSize(s, end-s)
        raise ValueError('%r is not a valid integer' % (py_s,))
    return 0


cdef class KnitIndexReader:
    """Parse the body of a knit index file into kndx._cache and kndx._history.

    read() pulls everything that is left in fp into one string and walks it
    with C pointers, one line at a time.  For every complete record the tuple
    (version_id, options, pos, size, parents, index) is stored in the cache
    dict under version_id; a version_id that was not in the cache yet is also
    appended to the history list.
    """

    # The index object being filled in, and the file-like object to read.
    cdef object kndx
    cdef object fp

    # kndx._cache (must be an exact dict) and kndx._history (must be an exact
    # list); see validate().
    cdef object cache
    cdef object history

    # Start of the next unparsed line, and one past the last byte of the
    # text being parsed.  Only meaningful while read() is running.
    cdef char * cur_str
    cdef char * end_str

    # C-level copy of len(self.history), kept up to date as records are
    # appended so parent indexes can be bounds checked without a Python call.
    cdef int history_len

    def __init__(self, kndx, fp):
        self.kndx = kndx
        self.fp = fp

        self.cache = kndx._cache
        self.history = kndx._history

        self.cur_str = NULL
        self.end_str = NULL
        # Synchronised with the real list length in read(), once validate()
        # has confirmed that history is a list.
        self.history_len = 0

    cdef int validate(self) except -1:
        """Check that cache and history are usable with the raw C API.

        :raises TypeError: if kndx._cache is not exactly a dict or
            kndx._history is not exactly a list.
        :return: 0 on success.
        """
        if not PyDict_CheckExact(self.cache):
            raise TypeError('kndx._cache must be a python dict')
        if not PyList_CheckExact(self.history):
            raise TypeError('kndx._history must be a python list')
        return 0

    cdef object process_options(self, char *option_str, char *end):
        """Process the options string into a list.

        :param option_str: First character of the comma separated options.
        :param end: The character just after the options.
        :return: A list with one string per option.
        """
        cdef char *next

        # This is alternative code which creates a python string and splits it.
        # It is "correct" and more obvious, but slower than the following code.
        # It can be uncommented to switch in case the other code is seen as
        # suspect.
        # options = PyString_FromStringAndSize(option_str,
        #                                      end - option_str)
        # return options.split(',')

        final_options = []

        while option_str < end:
            next = <char*>memchr(option_str, c',', end - option_str)
            if next == NULL:
                # No further separator: the rest of the field is one option.
                next = end
            next_option = PyString_FromStringAndSize(option_str,
                                                     next - option_str)
            PyList_Append(final_options, next_option)

            # Move past the ','
            option_str = next+1

        return final_options

    cdef object process_parents(self, char *parent_str, char *end):
        """Turn the parents field of a record into a tuple of revision ids.

        Each parent is terminated by a space.  A parent starting with '.' is
        an explicit revision id (the '.' is dropped); anything else is an
        integer index into the history list.

        :param parent_str: First character of the parents field.
        :param end: The character just after the parents field.
        :return: A tuple of parent revision ids.
        :raises ValueError: if an index is not a valid integer.
        :raises IndexError: if an index is negative or refers to a history
            entry that does not exist yet.
        """
        cdef char *next
        cdef int int_parent
        cdef char *parent_end

        # Alternative, correct but slower code.
        #
        # parents = PyString_FromStringAndSize(parent_str,
        #                                      end - parent_str)
        # real_parents = []
        # for parent in parents.split():
        #     if parent[0].startswith('.'):
        #         real_parents.append(parent[1:])
        #     else:
        #         real_parents.append(self.history[int(parent)])
        # return real_parents

        parents = []
        while parent_str <= end:
            next = <char*>memchr(parent_str, c' ', end - parent_str)
            if next == NULL or next >= end or next == parent_str:
                # No terminating space left, or an empty entry: stop here.
                break

            if parent_str[0] == c'.':
                # This is an explicit revision id
                parent_str = parent_str + 1
                parent = PyString_FromStringAndSize(parent_str,
                                                    next - parent_str)
            else:
                # This is an integer mapping to original
                string_to_int_safe(parent_str, next, &int_parent)

                if int_parent < 0:
                    # strtol accepts a leading '-', and PyList_GET_ITEM does
                    # no bounds checking of its own, so a negative index must
                    # never reach it.
                    raise IndexError('Parent index must not be negative:'
                        ' %d' % (int_parent,))
                if int_parent >= self.history_len:
                    raise IndexError('Parent index refers to a revision which'
                        ' does not exist yet.'
                        ' %d > %d' % (int_parent, self.history_len))
                parent = PyList_GET_ITEM(self.history, int_parent)
                # PyList_GET_ITEM returns a borrowed reference, but it is
                # declared as returning 'object', so the result is handled
                # like an owned reference.  Take one to keep the count
                # balanced.
                Py_INCREF(parent)
            PyList_Append(parents, parent)
            parent_str = next + 1
        return tuple(parents)

    cdef int process_one_record(self, char *start, char *end) except -1:
        """Take a simple string and split it into an index record.

        The expected layout is
        'version_id options pos size parents... ' with single spaces between
        the fields.

        :param start: First character of the record.
        :param end: The character just after the record (the caller passes
            the position of the closing ':').
        :return: 1 if a record was stored in the cache, 0 if the line was too
            short to be a record and was skipped.
        :raises KnitCorrupt: if pos, size or a parent index cannot be parsed
            or a parent index is out of range.
        """
        cdef char *version_id_str
        cdef int version_id_size
        cdef char *option_str
        cdef char *option_end
        cdef char *pos_str
        cdef int pos
        cdef char *size_str
        cdef int size
        cdef char *parent_str
        cdef void *cache_entry

        version_id_str = start
        option_str = <char*>memchr(version_id_str, c' ', end - version_id_str)
        if option_str == NULL or option_str >= end:
            # Short entry
            return 0
        version_id_size = <int>(option_str - version_id_str)
        # Move past the space character
        option_str = option_str + 1

        pos_str = <char*>memchr(option_str, c' ', end - option_str)
        if pos_str == NULL or pos_str >= end:
            # Short entry
            return 0
        option_end = pos_str
        pos_str = pos_str + 1

        size_str = <char*>memchr(pos_str, c' ', end - pos_str)
        if size_str == NULL or size_str >= end:
            # Short entry
            return 0
        size_str = size_str + 1

        parent_str = <char*>memchr(size_str, c' ', end - size_str)
        if parent_str == NULL or parent_str >= end:
            # Missing parents
            return 0
        parent_str = parent_str + 1

        version_id = PyString_FromStringAndSize(version_id_str,
                                                version_id_size)
        options = self.process_options(option_str, option_end)

        try:
            # Each field ends one character before the start of the next one
            # (the separating space).
            string_to_int_safe(pos_str, size_str - 1, &pos)
            string_to_int_safe(size_str, parent_str - 1, &size)
            parents = self.process_parents(parent_str, end)
        except (ValueError, IndexError), e:
            py_line = PyString_FromStringAndSize(start, end - start)
            raise errors.KnitCorrupt(self.kndx._filename,
                                     "line %r: %s" % (py_line, e))

        cache_entry = PyDict_GetItem_void(self.cache, version_id)
        if cache_entry == NULL:
            # First time this version_id is seen: it gets the next history
            # slot.
            PyList_Append(self.history, version_id)
            index = self.history_len
            self.history_len = self.history_len + 1
        else:
            # The version_id is already cached: reuse the index stored in
            # the existing entry.
            # PyTuple_GetItem_void_void does *not* increment the reference
            # counter, but casting to <object> does.
            index = <object>PyTuple_GetItem_void_void(cache_entry, 5)

        PyDict_SetItem(self.cache, version_id,
                       (version_id,
                        options,
                        pos,
                        size,
                        parents,
                        index,
                       ))
        return 1

    cdef int process_next_record(self) except -1:
        """Process the next record in the file.

        Advances cur_str past the current line.  A line is only handed to
        process_one_record when its last character is ':'.

        :return: 0 if the line was skipped as incomplete, otherwise the
            result of process_one_record.
        """
        cdef char *last
        cdef char *start

        start = self.cur_str
        # Find the next newline
        last = <char*>memchr(start, c'\n', self.end_str - start)
        if last == NULL:
            # Process until the end of the file
            last = self.end_str - 1
            self.cur_str = self.end_str
        else:
            # The last character is right before the '\n'
            # And the next string is right after it
            self.cur_str = last + 1
            last = last - 1

        # The pointer comparison comes first so that 'last' is not
        # dereferenced when the line is empty.
        if last <= start or last[0] != c':':
            # Incomplete record
            return 0

        return self.process_one_record(start, last)

    def read(self):
        """Check the header of fp, then parse every record that follows it.

        :raises TypeError: if kndx._cache is not a dict or kndx._history is
            not a list.
        :raises KnitCorrupt: if a record contains an unparsable integer or an
            out of range parent index.
        """
        cdef int text_size

        self.validate()
        # history may already hold entries.  The bounds check on parent
        # indexes and the index given to new records both rely on
        # history_len matching the real length of the list.
        self.history_len = len(self.history)

        self.kndx.check_header(self.fp)

        # We read the whole thing at once
        # TODO: jam 2007-05-09 Consider reading incrementally rather than
        #       having to have the whole thing read up front.
        #       we already know that calling f.readlines() versus lots of
        #       f.readline() calls is faster.
        #       The other possibility is to avoid a Python String here
        #       completely. However self.fp may be a 'file-like' object
        #       it is not guaranteed to be a real file.
        # 'text' has to stay referenced until parsing is finished, because
        # cur_str and end_str point into its buffer.
        # NOTE(review): PyString_Size and PyString_AsString are declared
        # without an exception clause, so this assumes fp.read() returns a
        # string -- confirm for all callers.
        text = self.fp.read()
        text_size = PyString_Size(text)
        self.cur_str = PyString_AsString(text)
        # This points one past the last character in the string
        self.end_str = self.cur_str + text_size

        while self.cur_str < self.end_str:
            self.process_next_record()


def _load_data_c(kndx, fp):
    """Parse the knit index available from fp into the state held by kndx.

    :param kndx: The index object; its _cache and _history are filled in.
    :param fp: A file-like object positioned at the start of the index.
    """
    KnitIndexReader(kndx, fp).read()