From 62932e0f74102d3c65a0e3e6e2de3673e64741d8 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 27 Mar 2013 11:19:54 -0500 Subject: encoding: use uninterpreted bytes whenever possible Git is encoding-agnostic in the sense that it interprets file contents, commit messages and paths as binary. In the case of paths, this means that the non-NUL bytes returned from readdir(2) are stored and later passed to lstat(2) and creat(2). See git-commit(1) for details. To be compatible with Git's mode of operation, we also use raw bytes whenever possible. hashlib's hexdigest returns Python 'str', which we immediately encode as ASCII so that it can be used with path components and cleaned bytes to be committed. Renamed variable 'bytes' to 'bytecount' due to a conflict with the built-in type Includes contributions from: Stephen Miller --- git-fat | 105 +++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/git-fat b/git-fat index aa90dbf..b40b32e 100755 --- a/git-fat +++ b/git-fat @@ -42,10 +42,10 @@ def mkdir_p(path): else: raise def readblocks(stream): - bytes = 0 + bytecount = 0 while True: data = stream.read(BLOCK_SIZE) - bytes += len(data) + bytecount += len(data) if not data: break yield data @@ -101,19 +101,19 @@ class GitFat(object): self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip() - self.objdir = os.path.join(self.gitdir, 'fat', 'objects') + self.objdir = os.path.join(self.gitdir, b'fat', b'objects') if os.environ.get('GIT_FAT_VERSION') == '1': self.encode = self.encode_v1 else: self.encode = self.encode_v2 def magiclen(enc): - return len(enc(hashlib.sha1('dummy').hexdigest(), 5)) + return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5)) self.magiclen = magiclen(self.encode) # Current version 
self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions def setup(self): mkdir_p(self.objdir) def get_rsync(self): - cfgpath = os.path.join(self.gitroot,'.gitfat') + cfgpath = os.path.join(self.gitroot, b'.gitfat') remote = gitconfig_get('rsync.remote', file=cfgpath) ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) @@ -127,38 +127,38 @@ class GitFat(object): else: self.verbose('Pulling from %s' % (remote)) - cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-'] - rshopts = '' + cmd = [b'rsync', b'--progress', b'--ignore-existing', b'--from0', b'--files-from=-'] + rshopts = b'' if ssh_user: - rshopts += ' -l ' + ssh_user + rshopts += b' -l ' + ssh_user if ssh_port: - rshopts += ' -p ' + ssh_port + rshopts += b' -p ' + ssh_port if rshopts: - cmd.append('--rsh=ssh' + rshopts) + cmd.append(b'--rsh=ssh' + rshopts) if push: - cmd += [self.objdir + '/', remote + '/'] + cmd += [self.objdir + b'/', remote + b'/'] else: - cmd += [remote + '/', self.objdir + '/'] + cmd += [remote + b'/', self.objdir + b'/'] return cmd def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() - def encode_v1(self, digest, bytes): + def encode_v1(self, digest, bytecount): 'Produce legacy representation of file to be stored in repository.' - return '#$# git-fat %s\n' % (digest,) - def encode_v2(self, digest, bytes): + return (b'#$# git-fat ' + digest + b'\n') + def encode_v2(self, digest, bytecount): 'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.' 
- return '#$# git-fat %s %20d\n' % (digest, bytes) - def decode(self, string, noraise=False): - cookie = '#$# git-fat ' - if string.startswith(cookie): - parts = string[len(cookie):].split() + return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII')) + def decode(self, bstring, noraise=False): + cookie = b'#$# git-fat ' + if bstring.startswith(cookie): + parts = bstring[len(cookie):].split() digest = parts[0] - bytes = int(parts[1]) if len(parts) > 1 else None - return digest, bytes + bytecount = int(parts[1]) if len(parts) > 1 else None + return digest, int(bytecount) elif noraise: return None, None else: - raise GitFat.DecodeError('Could not decode %s' % (string)) + raise GitFat.DecodeError('Could not decode %s' % repr(bstring)) def decode_stream(self, stream): 'Return digest if git-fat cache, otherwise return iterator over entire file contents' preamble = stream.read(self.magiclen) @@ -173,11 +173,11 @@ class GitFat(object): if stat.st_size != self.magiclen: return False, None # read file - digest, bytes = self.decode_stream(open(fname, 'rb')) - if isinstance(digest, str): - return digest, bytes + digest, bytecount = self.decode_stream(open(fname, 'rb')) + if isinstance(digest, bytes): + return digest, bytecount else: - return None, bytes + return None, bytecount def decode_clean(self, body): ''' Attempt to decode version in working tree. The tree version could be changed to have a more @@ -185,16 +185,17 @@ class GitFat(object): version decodes successfully, it indicates that the fat data is not currently available in this repository. 
''' - digest, bytes = self.decode(body, noraise=True) + digest, bytecount = self.decode(body, noraise=True) return digest def filter_clean(self, instream, outstreamclean): h = hashlib.new('sha1') - bytes = 0 - fd, tmpname = tempfile.mkstemp(dir=self.objdir) + bytecount = 0 + # mkstemp requires 'str' rather than native filesystem bytes + fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding())) try: ishanging = False cached = False # changes to True when file is cached - with os.fdopen(fd, 'w') as cache: + with os.fdopen(fd, 'wb') as cache: outstream = cache blockiter = readblocks(instream) firstblock = True @@ -205,10 +206,10 @@ class GitFat(object): outstream = outstreamclean firstblock = False h.update(block) - bytes += len(block) + bytecount += len(block) outstream.write(block) outstream.flush() - digest = h.hexdigest() + digest = h.hexdigest().encode('ASCII') objfile = os.path.join(self.objdir, digest) if not ishanging: if os.path.exists(objfile): @@ -218,7 +219,7 @@ class GitFat(object): os.rename(tmpname, objfile) self.verbose('git-fat filter-clean: caching to %s' % objfile) cached = True - outstreamclean.write(self.encode(digest, bytes)) + outstreamclean.write(self.encode(digest, bytecount)) finally: if not cached: os.remove(tmpname) @@ -239,16 +240,16 @@ class GitFat(object): # Ensure streams are treated as binary sys.stdin = ensure_binary_mode(sys.stdin) sys.stdout = ensure_binary_mode(sys.stdout) - result, bytes = self.decode_stream(sys.stdin) - if isinstance(result, str): # We got a digest + result, bytecount = self.decode_stream(sys.stdin) + if isinstance(result, bytes): # We got a digest objfile = os.path.join(self.objdir, result) try: - cat(open(objfile), sys.stdout) self.verbose('git-fat filter-smudge: restoring from %s' % objfile) - except IOError: # file not found + cat(open(objfile, 'rb'), sys.stdout) + except IOError: # file not found self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) - 
sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file - else: # We have an iterable over the original input. + sys.stdout.write(self.encode(result, bytecount)) # could leave a better notice about how to recover this file + else: # We have an iterable over the original input. self.verbose('git-fat filter-smudge: not a managed file') cat_iter(result, sys.stdout) def catalog_objects(self): @@ -263,13 +264,13 @@ class GitFat(object): p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def cut_sha1hash(input, output): for line in input: - output.write(line.split()[0] + '\n') + output.write(line.split()[0] + b'\n') output.close() cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) cut_thread.start() for line in p2.stdout: objhash, objtype, size = line.split() - if objtype == 'blob' and int(size) in self.magiclens: + if objtype == b'blob' and int(size) in self.magiclens: try: fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0] referenced.add(fathash) @@ -315,9 +316,9 @@ class GitFat(object): pushall = '--all' in args files = self.referenced_objects(all=pushall) & self.catalog_objects() cmd = self.get_rsync_command(push=True) - self.verbose('Executing: %s' % ' '.join(cmd)) + self.verbose('Executing: %s' % b' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - p.communicate(input='\x00'.join(files)) + p.communicate(input=b'\x00'.join(files)) def checkout(self, show_orphans=False): 'Update any stale files in the present working tree' for digest, fname in self.orphan_files(): @@ -349,9 +350,9 @@ class GitFat(object): refargs['rev'] = rev files = self.referenced_objects(**refargs) - self.catalog_objects() cmd = self.get_rsync_command(push=False) - self.verbose('Executing: %s' % ' '.join(cmd)) + self.verbose('Executing: %s' % b' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - 
p.communicate(input='\x00'.join(files)) + p.communicate(input=(b'\x00'.join(files))) self.checkout() def cmd_checkout(self, args): self.checkout(show_orphans=True) @@ -401,6 +402,7 @@ class GitFat(object): time1 = time.time() self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0)) def cmd_find(self, args): + # FIXME: Need input validation here maxsize = int(args[0]) blobsizes = dict(self.gen_large_blobs('--all', maxsize)) time0 = time.time() @@ -421,19 +423,20 @@ class GitFat(object): revlist.wait() difftree.wait() def cmd_index_filter(self, args): + # FIXME: Need input validation here manage_gitattributes = '--manage-gitattributes' in args filelist = set(f.strip() for f in open(args[0]).readlines()) lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE) updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE) - for line in lsfiles.stdout: - mode, sep, tail = line.partition(' ') - blobhash, sep, tail = tail.partition(' ') - stageno, sep, tail = tail.partition('\t') + for line in lsfiles.stdout.read(): + mode, sep, tail = line.partition(b' ') + blobhash, sep, tail = tail.partition(b' ') + stageno, sep, tail = tail.partition(b'\t') filename = tail.strip() if filename not in filelist: continue # This file will contain the hash of the cleaned object - hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash) + hashfile = os.path.join(self.gitdir, b'fat', b'index-filter', blobhash) try: cleanedobj = open(hashfile).read().rstrip() except IOError: -- cgit v1.2.1