summaryrefslogtreecommitdiff
path: root/bzrlib/estimate_compressed_size.py
blob: 39262af8816eeb7d08a3745cc993e33870dc2533 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Copyright (C) 2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Code to estimate the entropy of content"""

from __future__ import absolute_import

import zlib


class ZLibEstimator(object):
    """Uses zlib.compressobj to estimate compressed size.

    Content is fed through a real zlib compressor, and the number of
    compressed bytes it has emitted so far is tracked. Bytes that have been
    handed to the compressor since it last emitted output are tracked
    separately, and their compressed size is guessed from the compression
    ratio observed so far. The compressor is only force-flushed when that
    guess says the target may have been reached.
    """

    def __init__(self, target_size, min_compression=2.0):
        """Create a new estimator.

        :param target_size: The desired size of the compressed content.
        :param min_compression: Estimated minimum compression. By default we
            assume that the content is 'text', which means a min compression of
            about 2:1. This value seeds the compression estimate until real
            compressed output has been observed. It must be positive.
        :raises ValueError: If min_compression is not a positive number.
        """
        # Coerce to float so the division in full() is never an integer
        # division, even when the caller passes an int on Python 2.
        min_compression = float(min_compression)
        if not min_compression > 0.0:
            # full() divides by this ratio, so zero/negative/NaN is unusable.
            raise ValueError('min_compression must be positive, got %r'
                             % (min_compression,))
        self._target_size = target_size
        self._compressor = zlib.compressobj()
        # Total bytes of raw content passed to add_content().
        self._uncompressed_size_added = 0
        # Total bytes of compressed output the compressor has returned.
        self._compressed_size_added = 0
        # Raw bytes added since the compressor last returned any output.
        self._unflushed_size_added = 0
        # Honour the caller's ratio rather than a hard-coded 2.0.
        self._estimated_compression = min_compression

    def add_content(self, content):
        """Feed more content into the estimator.

        :param content: The raw bytes to add.
        """
        self._uncompressed_size_added += len(content)
        self._unflushed_size_added += len(content)
        z_size = len(self._compressor.compress(content))
        if z_size > 0:
            # The compressor returned some output, so the counters and the
            # compression estimate can be refreshed.
            self._record_z_len(z_size)

    def _record_z_len(self, count):
        """Account for ``count`` newly produced compressed bytes.

        Resets the unflushed counter and recomputes the estimated compression
        ratio from the running totals.
        """
        # We got some compressed bytes, update the counters
        self._compressed_size_added += count
        self._unflushed_size_added = 0
        # So far we've read X uncompressed bytes, and written Y compressed
        # bytes. We should have a decent estimate of the final compression.
        self._estimated_compression = (float(self._uncompressed_size_added)
            / self._compressed_size_added)

    def full(self):
        """Have we reached the target size?

        If content has been added since the compressor last produced output,
        its compressed size is estimated first. Only when that estimate says
        the remaining space would be used up is the compressor sync-flushed
        to obtain a real figure.

        :return: True if the compressed bytes recorded so far are at least
            the target size, False otherwise.
        """
        if self._unflushed_size_added:
            remaining_size = self._target_size - self._compressed_size_added
            # Estimate how much compressed content the unflushed data will
            # consume
            est_z_size = (self._unflushed_size_added /
                          self._estimated_compression)
            if est_z_size >= remaining_size:
                # The estimate says we may have hit the target; flush to
                # replace the guess with the real compressed size.
                z_size = len(self._compressor.flush(zlib.Z_SYNC_FLUSH))
                self._record_z_len(z_size)
        return self._compressed_size_added >= self._target_size