Retry on checksum failures
HTTP is a fun protocol. Size is basically optional, and clients implicitly trust that the server and the socket have transferred all of the bytes, which *really* means you should always checksum. But previously we didn't checksum as part of the retry loop. So if anything happened in python-requests, in lower-level library code, or in the system itself that caused bytes to be lost from the buffer, resulting in an incomplete transfer, we wouldn't know until the checksum was verified, after the retry loop had already finished. So now we verify the checksum inside the retry loop and re-trigger the download if verification fails. This involved a minor shift in the download logic, and required a minor fix to an image checksum test, which would otherwise loop for 90 seconds while retrying. Closes-Bug: 2038934 Change-Id: I543a60555a2621b49dd7b6564bd0654a46db2e9a
This commit is contained in:
3 changed files with 33 additions and 15 deletions
@@ -554,7 +554,9 @@ def _download_image(image_info):
msg = 'Unable to write image to {}. Error: {}'.format(
image_location, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
image_download.verify_image(image_location)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
@@ -575,7 +577,6 @@ def _download_image(image_info):
'totaltime': totaltime,
'size': image_download.bytes_transferred,
'reported': image_download.content_length})
image_download.verify_image(image_location)
def _validate_image_info(ext, image_info=None, **kwargs):
@@ -729,7 +730,12 @@ class StandbyExtension(base.BaseAgentExtension):
msg = ('Unable to write image to device {}. '
'Error: {}').format(device, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
# Verify the checksum of the streamed image is correct while
# still in the retry loop, so we can retry should a checksum
# failure be detected.
image_download.verify_image(device)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
@@ -749,8 +755,6 @@ class StandbyExtension(base.BaseAgentExtension):
{'device': device, 'totaltime': totaltime,
'size': image_download.bytes_transferred,
'reported': image_download.content_length})
# Verify if the checksum of the streamed image is correct
image_download.verify_image(device)
# Fix any gpt partition
try:
disk_utils.fix_gpt_partition(device, node_uuid=None)
@@ -441,6 +441,11 @@ class TestStandbyExtension(base.IronicAgentTest):
log_mock_calls = [
mock.call.info('Attempting to download image from %s',
'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image downloaded from %(image_location)s in '
'%(totaltime)s seconds. Transferred %(size)s'
'bytes. Server originaly reported: %(reported)s.',
@@ -448,11 +453,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY,
'size': 11,
'reported': None}),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'})
]
log_mock.assert_has_calls(log_mock_calls)
@@ -509,6 +509,9 @@ class TestStandbyExtension(base.IronicAgentTest):
@mock.patch('requests.get', autospec=True)
def test_download_image_verify_fails(self, requests_mock, open_mock,
hash_mock):
# Set the config to 0 retries, so we don't retry in this case
# and cause the test download to loop multiple times.
self.config(image_download_connection_retries=0)
image_info = _build_fake_image_info()
response = requests_mock.return_value
response.status_code = 200
@@ -1334,6 +1337,11 @@ class TestStandbyExtension(base.IronicAgentTest):
mock_log_calls = [
mock.call.info('Attempting to download image from %s',
'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s'
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image streamed onto device %(device)s in '
'%(totaltime)s seconds for %(size)s bytes. '
'Server originaly reported %(reported)s.',
@@ -1341,11 +1349,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY,
'size': 11,
'reported': 11}),
mock.call.debug('Verifying image at %(image_location)s'
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('%(device)s UUID is now %(root_uuid)s',
{'device': '/dev/foo', 'root_uuid': 'aaaabbbb'})
]
@@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a failure case where downloads would not be retried when the
checksum failed verification. The agent now includes the checksum
activity as part of the file download operation, and will
automatically retry downloads when the checksum fails in
accordance with the existing download retry logic.
This is largely in response to what appear to be intermittent
transport failures at lower levels which we cannot otherwise
detect.
Reference in New Issue
Block a user
Blocking a user prevents them from interacting with repositories, such as opening or commenting on pull requests or issues. Learn more about blocking a user.