summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jed Brown <jed@59A2.org> 2013-03-27 11:19:54 -0500
committer: Jed Brown <jed@59A2.org> 2013-03-27 13:44:18 -0500
commit: 62932e0f74102d3c65a0e3e6e2de3673e64741d8 (patch)
tree: 45de8e04347c69042c6f73ffea46865120ab4fd8
parent: 3534426e9b7afe78f670da36b22dc05b4b275139 (diff)
download: git-fat-62932e0f74102d3c65a0e3e6e2de3673e64741d8.tar.gz
encoding: use uninterpreted bytes whenever possible
Git is encoding-agnostic in the sense that it interprets file contents, commit messages and paths as binary. In the case of paths, this means that the non-NUL bytes returned from readdir(2) are stored and later passed to lstat(2) and creat(2). See git-commit(1) for details. To be compatible with Git's mode of operation, we also use raw bytes whenever possible. hashlib's hexdigest returns a Python 'str', which we immediately encode as ASCII so that it can be used both as a path component and in the cleaned bytes to be committed. Renamed the variable 'bytes' to 'bytecount' because it conflicted with the built-in type 'bytes'. Includes contributions from: Stephen Miller <stephen.l.miller1@navy.mil>
-rwxr-xr-x git-fat 105
1 files changed, 54 insertions, 51 deletions
diff --git a/git-fat b/git-fat
index aa90dbf..b40b32e 100755
--- a/git-fat
+++ b/git-fat
@@ -42,10 +42,10 @@ def mkdir_p(path):
else: raise
def readblocks(stream):
- bytes = 0
+ bytecount = 0
while True:
data = stream.read(BLOCK_SIZE)
- bytes += len(data)
+ bytecount += len(data)
if not data:
break
yield data
@@ -101,19 +101,19 @@ class GitFat(object):
self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
- self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
+ self.objdir = os.path.join(self.gitdir, b'fat', b'objects')
if os.environ.get('GIT_FAT_VERSION') == '1':
self.encode = self.encode_v1
else:
self.encode = self.encode_v2
def magiclen(enc):
- return len(enc(hashlib.sha1('dummy').hexdigest(), 5))
+ return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5))
self.magiclen = magiclen(self.encode) # Current version
self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions
def setup(self):
mkdir_p(self.objdir)
def get_rsync(self):
- cfgpath = os.path.join(self.gitroot,'.gitfat')
+ cfgpath = os.path.join(self.gitroot, b'.gitfat')
remote = gitconfig_get('rsync.remote', file=cfgpath)
ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
@@ -127,38 +127,38 @@ class GitFat(object):
else:
self.verbose('Pulling from %s' % (remote))
- cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
- rshopts = ''
+ cmd = [b'rsync', b'--progress', b'--ignore-existing', b'--from0', b'--files-from=-']
+ rshopts = b''
if ssh_user:
- rshopts += ' -l ' + ssh_user
+ rshopts += b' -l ' + ssh_user
if ssh_port:
- rshopts += ' -p ' + ssh_port
+ rshopts += b' -p ' + ssh_port
if rshopts:
- cmd.append('--rsh=ssh' + rshopts)
+ cmd.append(b'--rsh=ssh' + rshopts)
if push:
- cmd += [self.objdir + '/', remote + '/']
+ cmd += [self.objdir + b'/', remote + b'/']
else:
- cmd += [remote + '/', self.objdir + '/']
+ cmd += [remote + b'/', self.objdir + b'/']
return cmd
def revparse(self, revname):
return subprocess.check_output(['git', 'rev-parse', revname]).strip()
- def encode_v1(self, digest, bytes):
+ def encode_v1(self, digest, bytecount):
'Produce legacy representation of file to be stored in repository.'
- return '#$# git-fat %s\n' % (digest,)
- def encode_v2(self, digest, bytes):
+ return (b'#$# git-fat ' + digest + b'\n')
+ def encode_v2(self, digest, bytecount):
'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
- return '#$# git-fat %s %20d\n' % (digest, bytes)
- def decode(self, string, noraise=False):
- cookie = '#$# git-fat '
- if string.startswith(cookie):
- parts = string[len(cookie):].split()
+ return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII'))
+ def decode(self, bstring, noraise=False):
+ cookie = b'#$# git-fat '
+ if bstring.startswith(cookie):
+ parts = bstring[len(cookie):].split()
digest = parts[0]
- bytes = int(parts[1]) if len(parts) > 1 else None
- return digest, bytes
+ bytecount = int(parts[1]) if len(parts) > 1 else None
+ return digest, int(bytecount)
elif noraise:
return None, None
else:
- raise GitFat.DecodeError('Could not decode %s' % (string))
+ raise GitFat.DecodeError('Could not decode %s' % repr(bstring))
def decode_stream(self, stream):
'Return digest if git-fat cache, otherwise return iterator over entire file contents'
preamble = stream.read(self.magiclen)
@@ -173,11 +173,11 @@ class GitFat(object):
if stat.st_size != self.magiclen:
return False, None
# read file
- digest, bytes = self.decode_stream(open(fname, 'rb'))
- if isinstance(digest, str):
- return digest, bytes
+ digest, bytecount = self.decode_stream(open(fname, 'rb'))
+ if isinstance(digest, bytes):
+ return digest, bytecount
else:
- return None, bytes
+ return None, bytecount
def decode_clean(self, body):
'''
Attempt to decode version in working tree. The tree version could be changed to have a more
@@ -185,16 +185,17 @@ class GitFat(object):
version decodes successfully, it indicates that the fat data is not currently available in
this repository.
'''
- digest, bytes = self.decode(body, noraise=True)
+ digest, bytecount = self.decode(body, noraise=True)
return digest
def filter_clean(self, instream, outstreamclean):
h = hashlib.new('sha1')
- bytes = 0
- fd, tmpname = tempfile.mkstemp(dir=self.objdir)
+ bytecount = 0
+ # mkstemp requires 'str' rather than native filesystem bytes
+ fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding()))
try:
ishanging = False
cached = False # changes to True when file is cached
- with os.fdopen(fd, 'w') as cache:
+ with os.fdopen(fd, 'wb') as cache:
outstream = cache
blockiter = readblocks(instream)
firstblock = True
@@ -205,10 +206,10 @@ class GitFat(object):
outstream = outstreamclean
firstblock = False
h.update(block)
- bytes += len(block)
+ bytecount += len(block)
outstream.write(block)
outstream.flush()
- digest = h.hexdigest()
+ digest = h.hexdigest().encode('ASCII')
objfile = os.path.join(self.objdir, digest)
if not ishanging:
if os.path.exists(objfile):
@@ -218,7 +219,7 @@ class GitFat(object):
os.rename(tmpname, objfile)
self.verbose('git-fat filter-clean: caching to %s' % objfile)
cached = True
- outstreamclean.write(self.encode(digest, bytes))
+ outstreamclean.write(self.encode(digest, bytecount))
finally:
if not cached:
os.remove(tmpname)
@@ -239,16 +240,16 @@ class GitFat(object):
# Ensure streams are treated as binary
sys.stdin = ensure_binary_mode(sys.stdin)
sys.stdout = ensure_binary_mode(sys.stdout)
- result, bytes = self.decode_stream(sys.stdin)
- if isinstance(result, str): # We got a digest
+ result, bytecount = self.decode_stream(sys.stdin)
+ if isinstance(result, bytes): # We got a digest
objfile = os.path.join(self.objdir, result)
try:
- cat(open(objfile), sys.stdout)
self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
- except IOError: # file not found
+ cat(open(objfile, 'rb'), sys.stdout)
+ except IOError: # file not found
self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
- sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file
- else: # We have an iterable over the original input.
+ sys.stdout.write(self.encode(result, bytecount)) # could leave a better notice about how to recover this file
+ else: # We have an iterable over the original input.
self.verbose('git-fat filter-smudge: not a managed file')
cat_iter(result, sys.stdout)
def catalog_objects(self):
@@ -263,13 +264,13 @@ class GitFat(object):
p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
def cut_sha1hash(input, output):
for line in input:
- output.write(line.split()[0] + '\n')
+ output.write(line.split()[0] + b'\n')
output.close()
cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
cut_thread.start()
for line in p2.stdout:
objhash, objtype, size = line.split()
- if objtype == 'blob' and int(size) in self.magiclens:
+ if objtype == b'blob' and int(size) in self.magiclens:
try:
fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
referenced.add(fathash)
@@ -315,9 +316,9 @@ class GitFat(object):
pushall = '--all' in args
files = self.referenced_objects(all=pushall) & self.catalog_objects()
cmd = self.get_rsync_command(push=True)
- self.verbose('Executing: %s' % ' '.join(cmd))
+ self.verbose('Executing: %s' % b' '.join(cmd))
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
- p.communicate(input='\x00'.join(files))
+ p.communicate(input=b'\x00'.join(files))
def checkout(self, show_orphans=False):
'Update any stale files in the present working tree'
for digest, fname in self.orphan_files():
@@ -349,9 +350,9 @@ class GitFat(object):
refargs['rev'] = rev
files = self.referenced_objects(**refargs) - self.catalog_objects()
cmd = self.get_rsync_command(push=False)
- self.verbose('Executing: %s' % ' '.join(cmd))
+ self.verbose('Executing: %s' % b' '.join(cmd))
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
- p.communicate(input='\x00'.join(files))
+ p.communicate(input=(b'\x00'.join(files)))
self.checkout()
def cmd_checkout(self, args):
self.checkout(show_orphans=True)
@@ -401,6 +402,7 @@ class GitFat(object):
time1 = time.time()
self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
def cmd_find(self, args):
+ # FIXME: Need input validation here
maxsize = int(args[0])
blobsizes = dict(self.gen_large_blobs('--all', maxsize))
time0 = time.time()
@@ -421,19 +423,20 @@ class GitFat(object):
revlist.wait()
difftree.wait()
def cmd_index_filter(self, args):
+ # FIXME: Need input validation here
manage_gitattributes = '--manage-gitattributes' in args
filelist = set(f.strip() for f in open(args[0]).readlines())
lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
- for line in lsfiles.stdout:
- mode, sep, tail = line.partition(' ')
- blobhash, sep, tail = tail.partition(' ')
- stageno, sep, tail = tail.partition('\t')
+ for line in lsfiles.stdout.read():
+ mode, sep, tail = line.partition(b' ')
+ blobhash, sep, tail = tail.partition(b' ')
+ stageno, sep, tail = tail.partition(b'\t')
filename = tail.strip()
if filename not in filelist:
continue
# This file will contain the hash of the cleaned object
- hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
+ hashfile = os.path.join(self.gitdir, b'fat', b'index-filter', blobhash)
try:
cleanedobj = open(hashfile).read().rstrip()
except IOError: