From 62932e0f74102d3c65a0e3e6e2de3673e64741d8 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 27 Mar 2013 11:19:54 -0500 Subject: encoding: use uninterpreted bytes whenever possible Git is encoding-agnostic in the sense that it interprets file contents, commit messages and paths as binary. In the case of paths, this means that the non-NUL bytes returned from readdir(2) are stored and later passed to lstat(2) and creat(2). See git-commit(1) for details. To be compatible with Git's mode of operation, we also use raw bytes whenever possible. hashlib's hexdigest returns Python 'str', which we immediately encode as ASCII so that it can be used with path components and cleaned bytes to be committed. Renamed variable 'bytes' to 'bytecount' due to a conflict with the built-in type Includes contributions from: Stephen Miller --- git-fat | 105 +++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/git-fat b/git-fat index aa90dbf..b40b32e 100755 --- a/git-fat +++ b/git-fat @@ -42,10 +42,10 @@ def mkdir_p(path): else: raise def readblocks(stream): - bytes = 0 + bytecount = 0 while True: data = stream.read(BLOCK_SIZE) - bytes += len(data) + bytecount += len(data) if not data: break yield data @@ -101,19 +101,19 @@ class GitFat(object): self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip() - self.objdir = os.path.join(self.gitdir, 'fat', 'objects') + self.objdir = os.path.join(self.gitdir, b'fat', b'objects') if os.environ.get('GIT_FAT_VERSION') == '1': self.encode = self.encode_v1 else: self.encode = self.encode_v2 def magiclen(enc): - return len(enc(hashlib.sha1('dummy').hexdigest(), 5)) + return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5)) self.magiclen = magiclen(self.encode) # Current version 
self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions def setup(self): mkdir_p(self.objdir) def get_rsync(self): - cfgpath = os.path.join(self.gitroot,'.gitfat') + cfgpath = os.path.join(self.gitroot, b'.gitfat') remote = gitconfig_get('rsync.remote', file=cfgpath) ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) @@ -127,38 +127,38 @@ class GitFat(object): else: self.verbose('Pulling from %s' % (remote)) - cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-'] - rshopts = '' + cmd = [b'rsync', b'--progress', b'--ignore-existing', b'--from0', b'--files-from=-'] + rshopts = b'' if ssh_user: - rshopts += ' -l ' + ssh_user + rshopts += b' -l ' + ssh_user if ssh_port: - rshopts += ' -p ' + ssh_port + rshopts += b' -p ' + ssh_port if rshopts: - cmd.append('--rsh=ssh' + rshopts) + cmd.append(b'--rsh=ssh' + rshopts) if push: - cmd += [self.objdir + '/', remote + '/'] + cmd += [self.objdir + b'/', remote + b'/'] else: - cmd += [remote + '/', self.objdir + '/'] + cmd += [remote + b'/', self.objdir + b'/'] return cmd def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() - def encode_v1(self, digest, bytes): + def encode_v1(self, digest, bytecount): 'Produce legacy representation of file to be stored in repository.' - return '#$# git-fat %s\n' % (digest,) - def encode_v2(self, digest, bytes): + return (b'#$# git-fat ' + digest + b'\n') + def encode_v2(self, digest, bytecount): 'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.' 
- return '#$# git-fat %s %20d\n' % (digest, bytes) - def decode(self, string, noraise=False): - cookie = '#$# git-fat ' - if string.startswith(cookie): - parts = string[len(cookie):].split() + return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII')) + def decode(self, bstring, noraise=False): + cookie = b'#$# git-fat ' + if bstring.startswith(cookie): + parts = bstring[len(cookie):].split() digest = parts[0] - bytes = int(parts[1]) if len(parts) > 1 else None - return digest, bytes + bytecount = int(parts[1]) if len(parts) > 1 else None + return digest, int(bytecount) elif noraise: return None, None else: - raise GitFat.DecodeError('Could not decode %s' % (string)) + raise GitFat.DecodeError('Could not decode %s' % repr(bstring)) def decode_stream(self, stream): 'Return digest if git-fat cache, otherwise return iterator over entire file contents' preamble = stream.read(self.magiclen) @@ -173,11 +173,11 @@ class GitFat(object): if stat.st_size != self.magiclen: return False, None # read file - digest, bytes = self.decode_stream(open(fname, 'rb')) - if isinstance(digest, str): - return digest, bytes + digest, bytecount = self.decode_stream(open(fname, 'rb')) + if isinstance(digest, bytes): + return digest, bytecount else: - return None, bytes + return None, bytecount def decode_clean(self, body): ''' Attempt to decode version in working tree. The tree version could be changed to have a more @@ -185,16 +185,17 @@ class GitFat(object): version decodes successfully, it indicates that the fat data is not currently available in this repository. 
''' - digest, bytes = self.decode(body, noraise=True) + digest, bytecount = self.decode(body, noraise=True) return digest def filter_clean(self, instream, outstreamclean): h = hashlib.new('sha1') - bytes = 0 - fd, tmpname = tempfile.mkstemp(dir=self.objdir) + bytecount = 0 + # mkstemp requires 'str' rather than native filesystem bytes + fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding())) try: ishanging = False cached = False # changes to True when file is cached - with os.fdopen(fd, 'w') as cache: + with os.fdopen(fd, 'wb') as cache: outstream = cache blockiter = readblocks(instream) firstblock = True @@ -205,10 +206,10 @@ class GitFat(object): outstream = outstreamclean firstblock = False h.update(block) - bytes += len(block) + bytecount += len(block) outstream.write(block) outstream.flush() - digest = h.hexdigest() + digest = h.hexdigest().encode('ASCII') objfile = os.path.join(self.objdir, digest) if not ishanging: if os.path.exists(objfile): @@ -218,7 +219,7 @@ class GitFat(object): os.rename(tmpname, objfile) self.verbose('git-fat filter-clean: caching to %s' % objfile) cached = True - outstreamclean.write(self.encode(digest, bytes)) + outstreamclean.write(self.encode(digest, bytecount)) finally: if not cached: os.remove(tmpname) @@ -239,16 +240,16 @@ class GitFat(object): # Ensure streams are treated as binary sys.stdin = ensure_binary_mode(sys.stdin) sys.stdout = ensure_binary_mode(sys.stdout) - result, bytes = self.decode_stream(sys.stdin) - if isinstance(result, str): # We got a digest + result, bytecount = self.decode_stream(sys.stdin) + if isinstance(result, bytes): # We got a digest objfile = os.path.join(self.objdir, result) try: - cat(open(objfile), sys.stdout) self.verbose('git-fat filter-smudge: restoring from %s' % objfile) - except IOError: # file not found + cat(open(objfile, 'rb'), sys.stdout) + except IOError: # file not found self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) - 
sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file - else: # We have an iterable over the original input. + sys.stdout.write(self.encode(result, bytecount)) # could leave a better notice about how to recover this file + else: # We have an iterable over the original input. self.verbose('git-fat filter-smudge: not a managed file') cat_iter(result, sys.stdout) def catalog_objects(self): @@ -263,13 +264,13 @@ class GitFat(object): p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def cut_sha1hash(input, output): for line in input: - output.write(line.split()[0] + '\n') + output.write(line.split()[0] + b'\n') output.close() cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) cut_thread.start() for line in p2.stdout: objhash, objtype, size = line.split() - if objtype == 'blob' and int(size) in self.magiclens: + if objtype == b'blob' and int(size) in self.magiclens: try: fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0] referenced.add(fathash) @@ -315,9 +316,9 @@ class GitFat(object): pushall = '--all' in args files = self.referenced_objects(all=pushall) & self.catalog_objects() cmd = self.get_rsync_command(push=True) - self.verbose('Executing: %s' % ' '.join(cmd)) + self.verbose('Executing: %s' % b' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - p.communicate(input='\x00'.join(files)) + p.communicate(input=b'\x00'.join(files)) def checkout(self, show_orphans=False): 'Update any stale files in the present working tree' for digest, fname in self.orphan_files(): @@ -349,9 +350,9 @@ class GitFat(object): refargs['rev'] = rev files = self.referenced_objects(**refargs) - self.catalog_objects() cmd = self.get_rsync_command(push=False) - self.verbose('Executing: %s' % ' '.join(cmd)) + self.verbose('Executing: %s' % b' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - 
p.communicate(input='\x00'.join(files)) + p.communicate(input=(b'\x00'.join(files))) self.checkout() def cmd_checkout(self, args): self.checkout(show_orphans=True) @@ -401,6 +402,7 @@ class GitFat(object): time1 = time.time() self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0)) def cmd_find(self, args): + # FIXME: Need input validation here maxsize = int(args[0]) blobsizes = dict(self.gen_large_blobs('--all', maxsize)) time0 = time.time() @@ -421,19 +423,20 @@ class GitFat(object): revlist.wait() difftree.wait() def cmd_index_filter(self, args): + # FIXME: Need input validation here manage_gitattributes = '--manage-gitattributes' in args filelist = set(f.strip() for f in open(args[0]).readlines()) lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE) updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE) - for line in lsfiles.stdout: - mode, sep, tail = line.partition(' ') - blobhash, sep, tail = tail.partition(' ') - stageno, sep, tail = tail.partition('\t') + for line in lsfiles.stdout.read(): + mode, sep, tail = line.partition(b' ') + blobhash, sep, tail = tail.partition(b' ') + stageno, sep, tail = tail.partition(b'\t') filename = tail.strip() if filename not in filelist: continue # This file will contain the hash of the cleaned object - hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash) + hashfile = os.path.join(self.gitdir, b'fat', b'index-filter', blobhash) try: cleanedobj = open(hashfile).read().rstrip() except IOError: -- cgit v1.2.1