diff options
author | Samuel Merritt <sam@swiftstack.com> | 2018-01-26 16:51:03 -0800 |
---|---|---|
committer | Samuel Merritt <sam@swiftstack.com> | 2018-02-02 11:30:49 -0800 |
commit | 98d185905a173fd7cbcbe1f6a71b74eb7564851c (patch) | |
tree | f5edd507e445141d7ab375b0f1a66db5f7af9480 /swift/common/request_helpers.py | |
parent | d6e911c623e9fdc921e5a1e23bb4bf28b228d983 (diff) | |
download | swift-98d185905a173fd7cbcbe1f6a71b74eb7564851c.tar.gz |
Cleanup for iterators in SegmentedIterable
We had a pair of large, complicated iterators to handle fetching all
the segment data, and they were hard to read and think about. I tried
to break them out into some simpler pieces:
* one to handle coalescing multiple requests to the same segment
* one to handle fetching the bytes from each segment
* one to check that the download isn't taking too long
* one to count the bytes and make sure we sent the right number
* one to catch errors and handle cleanup
It's more nesting, but each level now does just one thing.
Change-Id: If6f5cbd79edeff6ecb81350792449ce767919bcc
Diffstat (limited to 'swift/common/request_helpers.py')
-rw-r--r-- | swift/common/request_helpers.py | 197 |
1 file changed, 101 insertions, 96 deletions
diff --git a/swift/common/request_helpers.py b/swift/common/request_helpers.py index ec6ceb2c9..6b80f16e9 100644 --- a/swift/common/request_helpers.py +++ b/swift/common/request_helpers.py @@ -353,7 +353,6 @@ class SegmentedIterable(object): self.current_resp = None def _coalesce_requests(self): - start_time = time.time() pending_req = pending_etag = pending_size = None try: for seg_dict in self.listing_iter: @@ -376,11 +375,6 @@ class SegmentedIterable(object): first_byte = first_byte or 0 go_to_end = last_byte is None or ( seg_size is not None and last_byte == seg_size - 1) - if time.time() - start_time > self.max_get_time: - raise SegmentError( - 'While processing manifest %s, ' - 'max LO GET time of %ds exceeded' % - (self.name, self.max_get_time)) # The "multipart-manifest=get" query param ensures that the # segment is a plain old object, not some flavor of large # object; therefore, its etag is its MD5sum and hence we can @@ -433,108 +427,119 @@ class SegmentedIterable(object): except ListingIterError: e_type, e_value, e_traceback = sys.exc_info() - if time.time() - start_time > self.max_get_time: - raise SegmentError( - 'While processing manifest %s, ' - 'max LO GET time of %ds exceeded' % - (self.name, self.max_get_time)) if pending_req: yield pending_req, pending_etag, pending_size six.reraise(e_type, e_value, e_traceback) - if time.time() - start_time > self.max_get_time: - raise SegmentError( - 'While processing manifest %s, ' - 'max LO GET time of %ds exceeded' % - (self.name, self.max_get_time)) if pending_req: yield pending_req, pending_etag, pending_size - def _internal_iter(self): + def _requests_to_bytes_iter(self): + # Take the requests out of self._coalesce_requests, actually make + # the requests, and generate the bytes from the responses. + # + # Yields 2-tuples (segment-name, byte-chunk). The segment name is + # used for logging. 
+ for data_or_req, seg_etag, seg_size in self._coalesce_requests(): + if isinstance(data_or_req, bytes): # ugly, awful overloading + yield ('data segment', data_or_req) + continue + seg_req = data_or_req + seg_resp = seg_req.get_response(self.app) + if not is_success(seg_resp.status_int): + close_if_possible(seg_resp.app_iter) + raise SegmentError( + 'While processing manifest %s, ' + 'got %d while retrieving %s' % + (self.name, seg_resp.status_int, seg_req.path)) + + elif ((seg_etag and (seg_resp.etag != seg_etag)) or + (seg_size and (seg_resp.content_length != seg_size) and + not seg_req.range)): + # The content-length check is for security reasons. Seems + # possible that an attacker could upload a >1mb object and + # then replace it with a much smaller object with same + # etag. Then create a big nested SLO that calls that + # object many times which would hammer our obj servers. If + # this is a range request, don't check content-length + # because it won't match. + close_if_possible(seg_resp.app_iter) + raise SegmentError( + 'Object segment no longer valid: ' + '%(path)s etag: %(r_etag)s != %(s_etag)s or ' + '%(r_size)s != %(s_size)s.' 
% + {'path': seg_req.path, 'r_etag': seg_resp.etag, + 'r_size': seg_resp.content_length, + 's_etag': seg_etag, + 's_size': seg_size}) + else: + self.current_resp = seg_resp + + seg_hash = None + if seg_resp.etag and not seg_req.headers.get('Range'): + # Only calculate the MD5 if it we can use it to validate + seg_hash = hashlib.md5() + + document_iters = maybe_multipart_byteranges_to_document_iters( + seg_resp.app_iter, + seg_resp.headers['Content-Type']) + + for chunk in itertools.chain.from_iterable(document_iters): + if seg_hash: + seg_hash.update(chunk) + yield (seg_req.path, chunk) + close_if_possible(seg_resp.app_iter) + + if seg_hash and seg_hash.hexdigest() != seg_resp.etag: + raise SegmentError( + "Bad MD5 checksum in %(name)s for %(seg)s: headers had" + " %(etag)s, but object MD5 was actually %(actual)s" % + {'seg': seg_req.path, 'etag': seg_resp.etag, + 'name': self.name, 'actual': seg_hash.hexdigest()}) + + def _byte_counting_iter(self): + # Checks that we give the client the right number of bytes. Raises + # SegmentError if the number of bytes is wrong. bytes_left = self.response_body_length - try: - for data_or_req, seg_etag, seg_size in self._coalesce_requests(): - if isinstance(data_or_req, bytes): - chunk = data_or_req # ugly, awful overloading - if bytes_left is None: - yield chunk - elif bytes_left >= len(chunk): - yield chunk - bytes_left -= len(chunk) - else: - yield chunk[:bytes_left] - continue - seg_req = data_or_req - seg_resp = seg_req.get_response(self.app) - if not is_success(seg_resp.status_int): - close_if_possible(seg_resp.app_iter) - raise SegmentError( - 'While processing manifest %s, ' - 'got %d while retrieving %s' % - (self.name, seg_resp.status_int, seg_req.path)) - - elif ((seg_etag and (seg_resp.etag != seg_etag)) or - (seg_size and (seg_resp.content_length != seg_size) and - not seg_req.range)): - # The content-length check is for security reasons. 
Seems - # possible that an attacker could upload a >1mb object and - # then replace it with a much smaller object with same - # etag. Then create a big nested SLO that calls that - # object many times which would hammer our obj servers. If - # this is a range request, don't check content-length - # because it won't match. - close_if_possible(seg_resp.app_iter) - raise SegmentError( - 'Object segment no longer valid: ' - '%(path)s etag: %(r_etag)s != %(s_etag)s or ' - '%(r_size)s != %(s_size)s.' % - {'path': seg_req.path, 'r_etag': seg_resp.etag, - 'r_size': seg_resp.content_length, - 's_etag': seg_etag, - 's_size': seg_size}) - else: - self.current_resp = seg_resp - - seg_hash = None - if seg_resp.etag and not seg_req.headers.get('Range'): - # Only calculate the MD5 if it we can use it to validate - seg_hash = hashlib.md5() - - document_iters = maybe_multipart_byteranges_to_document_iters( - seg_resp.app_iter, - seg_resp.headers['Content-Type']) - - for chunk in itertools.chain.from_iterable(document_iters): - if seg_hash: - seg_hash.update(chunk) - - if bytes_left is None: - yield chunk - elif bytes_left >= len(chunk): - yield chunk - bytes_left -= len(chunk) - else: - yield chunk[:bytes_left] - bytes_left -= len(chunk) - close_if_possible(seg_resp.app_iter) - raise SegmentError( - 'Too many bytes for %(name)s; truncating in ' - '%(seg)s with %(left)d bytes left' % - {'name': self.name, 'seg': seg_req.path, - 'left': bytes_left}) - close_if_possible(seg_resp.app_iter) + for seg_name, chunk in self._requests_to_bytes_iter(): + if bytes_left is None: + yield chunk + elif bytes_left >= len(chunk): + yield chunk + bytes_left -= len(chunk) + else: + yield chunk[:bytes_left] + bytes_left -= len(chunk) + raise SegmentError( + 'Too many bytes for %(name)s; truncating in ' + '%(seg)s with %(left)d bytes left' % + {'name': self.name, 'seg': seg_name, + 'left': bytes_left}) - if seg_hash and seg_hash.hexdigest() != seg_resp.etag: - raise SegmentError( - "Bad MD5 checksum in 
%(name)s for %(seg)s: headers had" - " %(etag)s, but object MD5 was actually %(actual)s" % - {'seg': seg_req.path, 'etag': seg_resp.etag, - 'name': self.name, 'actual': seg_hash.hexdigest()}) + if bytes_left: + raise SegmentError( + 'Not enough bytes for %s; closing connection' % self.name) - if bytes_left: + def _time_limited_iter(self): + # Makes sure a GET response doesn't take more than self.max_get_time + # seconds to process. Raises an exception if things take too long. + start_time = time.time() + for chunk in self._byte_counting_iter(): + now = time.time() + yield chunk + if now - start_time > self.max_get_time: raise SegmentError( - 'Not enough bytes for %s; closing connection' % self.name) + 'While processing manifest %s, ' + 'max LO GET time of %ds exceeded' % + (self.name, self.max_get_time)) + + def _internal_iter(self): + # Top level of our iterator stack: pass bytes through; catch and + # handle exceptions. + try: + for chunk in self._time_limited_iter(): + yield chunk except (ListingIterError, SegmentError) as err: self.logger.error(err) if not self.validated_first_segment: |