diff options
author | Sybren A. Stüvel <sybren@stuvel.eu> | 2011-07-31 20:47:49 +0200 |
---|---|---|
committer | Sybren A. Stüvel <sybren@stuvel.eu> | 2011-07-31 20:47:49 +0200 |
commit | 3200f895745919a6074505dd40c8d0f88b01836a (patch) | |
tree | 864f5a2722340061553407cdbcb33972cf44d7bc /rsa/varblock.py | |
parent | fa345b65cf1951d8177fa9f4f90c06e1249e5f6f (diff) | |
download | rsa-git-3200f895745919a6074505dd40c8d0f88b01836a.tar.gz |
Made hashing efficient for large files
Diffstat (limited to 'rsa/varblock.py')
-rw-r--r-- | rsa/varblock.py | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/rsa/varblock.py b/rsa/varblock.py new file mode 100644 index 0000000..b8bd899 --- /dev/null +++ b/rsa/varblock.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2011 Sybren A. Stüvel <sybren@stuvel.eu> +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''VARBLOCK file support + +The VARBLOCK file format is as follows, where || denotes byte concatenation: + + FILE := VERSION || BLOCK || BLOCK ... + + BLOCK := LENGTH || DATA + + LENGTH := varint-encoded length of the subsequent data. Varint comes from + Google Protobuf, and encodes an integer into a variable number of bytes. + Each byte uses the 7 lowest bits to encode the value. The highest bit set + to 1 indicates the next byte is also part of the varint. The last byte will + have this bit set to 0. + +This file format is called the VARBLOCK format, in line with the varint format +used to denote the block sizes. + +''' + +VARBLOCK_VERSION = 1 + +def read_varint(infile): + '''Reads a varint from the file. + + When the first byte to be read indicates EOF, (0, 0) is returned. When an + EOF occurs when at least one byte has been read, an EOFError exception is + raised. + + @param infile: the file-like object to read from. It should have a read() + method. + @returns (varint, length), the read varint and the number of read bytes. + ''' + + varint = 0 + read_bytes = 0 + + while True: + char = infile.read(1) + if len(char) == 0: + if read_bytes == 0: + return (0, 0) + raise EOFError('EOF while reading varint, value is %i so far' % + varint) + + byte = ord(char) + varint += (byte & 0x7F) << (7 * read_bytes) + + read_bytes += 1 + + if not byte & 0x80: + return (varint, read_bytes) + +def write_varint(outfile, value): + '''Writes a varint to a file. + + @param outfile: the file-like object to write to. It should have a write() + method. + @returns the number of written bytes. + ''' + + # there is a big difference between 'write the value 0' (this case) and + # 'there is nothing left to write' (the false-case of the while loop) + + if value == 0: + outfile.write('\x00') + return 1 + + written_bytes = 0 + while value > 0: + to_write = value & 0x7f + value = value >> 7 + + if value > 0: + to_write |= 0x80 + + outfile.write(chr(to_write)) + written_bytes += 1 + + return written_bytes + + +def yield_varblocks(infile): + '''Generator, yields each block in the input file. + + @param infile: file to read, is expected to have the VARBLOCK format as + described in the module's docstring. + @yields the contents of each block. + ''' + + # Check the version number + first_char = infile.read(1) + if len(first_char) == 0: + raise EOFError('Unable to read VARBLOCK version number') + + version = ord(first_char) + if version != VARBLOCK_VERSION: + raise ValueError('VARBLOCK version %i not supported' % version) + + while True: + (block_size, read_bytes) = read_varint(infile) + + # EOF at block boundary, that's fine. + if read_bytes == 0 and block_size == 0: + break + + block = infile.read(block_size) + + read_size = len(block) + if read_size != block_size: + raise EOFError('Block size is %i, but could read only %i bytes' % + (block_size, read_size)) + + + yield block + +def yield_fixedblocks(infile, blocksize): + '''Generator, yields each block of ``blocksize`` bytes in the input file. + + :param infile: file to read and separate in blocks. + :returns: a generator that yields the contents of each block + ''' + + while True: + block = infile.read(blocksize) + + read_bytes = len(block) + if read_bytes == 0: + break + + yield block + + if read_bytes < blocksize: + break + |