summaryrefslogtreecommitdiff
path: root/network/basics/get_url.py
blob: a15b78df4fe64592ff102f2eca16c5442e27c395 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
#!/usr/bin/python
# -*- coding: utf-8 -*-

# (c) 2012, Jan-Piet Mens <jpmens () gmail.com>
#
# This file is part of Ansible
#
# Ansible is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ansible is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible.  If not, see <http://www.gnu.org/licenses/>.
#
# see examples/playbooks/get_url.yml

import shutil
import datetime
import re
import tempfile

ANSIBLE_METADATA = {'status': ['stableinterface'],
                    'supported_by': 'core',
                    'version': '1.0'}

DOCUMENTATION = '''
---
module: get_url
short_description: Downloads files from HTTP, HTTPS, or FTP to node
description:
     - Downloads files from HTTP, HTTPS, or FTP to the remote server. The remote
       server I(must) have direct access to the remote resource.
     - By default, if an environment variable C(<protocol>_proxy) is set on
       the target host, requests will be sent through that proxy. This
       behaviour can be overridden by setting a variable for this task
       (see `setting the environment
       <http://docs.ansible.com/playbooks_environment.html>`_),
       or by using the use_proxy option.
     - HTTP redirects can redirect from HTTP to HTTPS so you should be sure that
       your proxy environment for both protocols is correct.
version_added: "0.6"
options:
  url:
    description:
      - HTTP, HTTPS, or FTP URL in the form (http|https|ftp)://[user[:pass]]@host.domain[:port]/path
    required: true
  dest:
    description:
      - absolute path of where to download the file to.
      - If C(dest) is a directory, either the server provided filename or, if
        none provided, the base name of the URL on the remote server will be
        used. If a directory, C(force) has no effect.
        If C(dest) is a directory, the file will always be
        downloaded (regardless of the force option), but replaced only if the contents changed.
    required: true
  tmp_dest:
    description:
      - absolute path of where temporary file is downloaded to.
      - Defaults to TMPDIR, TEMP or TMP env variables or a platform specific value
      - https://docs.python.org/2/library/tempfile.html#tempfile.tempdir
    required: false
    default: ''
    version_added: '2.1'
  force:
    description:
      - If C(yes) and C(dest) is not a directory, will download the file every
        time and replace the file if the contents change. If C(no), the file
        will only be downloaded if the destination does not exist. Generally
        should be C(yes) only for small local files. Prior to 0.6, this module
        behaved as if C(yes) was the default.
    version_added: "0.7"
    required: false
    choices: [ "yes", "no" ]
    default: "no"
    aliases: [ "thirsty" ]
  backup:
    description:
      - Create a backup file including the timestamp information so you can get
        the original file back if you somehow clobbered it incorrectly.
    required: false
    choices: [ "yes", "no" ]
    default: "no"
    version_added: '2.1'
  sha256sum:
    description:
      - If a SHA-256 checksum is passed to this parameter, the digest of the
        destination file will be calculated after it is downloaded to ensure
        its integrity and verify that the transfer completed successfully.
        This option is deprecated. Use 'checksum'.
    version_added: "1.3"
    required: false
    default: null
  checksum:
    description:
      - 'If a checksum is passed to this parameter, the digest of the
        destination file will be calculated after it is downloaded to ensure
        its integrity and verify that the transfer completed successfully.
        Format: <algorithm>:<checksum>, e.g.: checksum="sha256:D98291AC[...]B6DC7B97"
        If you worry about portability, only the sha1 algorithm is available
        on all platforms and python versions.  The third party hashlib
        library can be installed for access to additional algorithms.
        Additionally, if a checksum is passed to this parameter, and the file exist under
        the C(dest) location, the destination_checksum would be calculated, and if
        checksum equals destination_checksum, the file download would be skipped
        (unless C(force) is true). '
    version_added: "2.0"
    required: false
    default: null
  use_proxy:
    description:
      - if C(no), it will not use a proxy, even if one is defined in
        an environment variable on the target hosts.
    required: false
    default: 'yes'
    choices: ['yes', 'no']
  validate_certs:
    description:
      - If C(no), SSL certificates will not be validated. This should only be used
        on personally controlled sites using self-signed certificates.
    required: false
    default: 'yes'
    choices: ['yes', 'no']
  timeout:
    description:
      - Timeout in seconds for URL request
    required: false
    default: 10
    version_added: '1.8'
  headers:
    description:
        - 'Add custom HTTP headers to a request in the format "key:value,key:value"'
    required: false
    default: null
    version_added: '2.0'
  url_username:
    description:
      - The username for use in HTTP basic authentication. This parameter can be used
        without C(url_password) for sites that allow empty passwords.
    required: false
    version_added: '1.6'
  url_password:
    description:
        - The password for use in HTTP basic authentication. If the C(url_username)
          parameter is not specified, the C(url_password) parameter will not be used.
    required: false
    version_added: '1.6'
  force_basic_auth:
    version_added: '2.0'
    description:
      - httplib2, the library used by the uri module only sends authentication information when a webservice
        responds to an initial request with a 401 status. Since some basic auth services do not properly
        send a 401, logins will fail. This option forces the sending of the Basic authentication header
        upon initial request.
    required: false
    choices: [ "yes", "no" ]
    default: "no"
  others:
    description:
      - all arguments accepted by the M(file) module also work here
    required: false
# informational: requirements for nodes
requirements: [ ]
extends_documentation_fragment:
    - files
author: "Jan-Piet Mens (@jpmens)"
'''

EXAMPLES='''
- name: download foo.conf
  get_url: 
    url: http://example.com/path/file.conf 
    dest: /etc/foo.conf 
    mode: 0440

- name: download file and force basic auth
  get_url: 
    url: http://example.com/path/file.conf 
    dest: /etc/foo.conf 
    force_basic_auth: yes

- name: download file with custom HTTP headers
  get_url: 
    url: http://example.com/path/file.conf 
    dest: /etc/foo.conf 
    headers: 'key:value,key:value'

- name: download file with check (sha256)
  get_url: 
    url: http://example.com/path/file.conf 
    dest: /etc/foo.conf 
    checksum: sha256:b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c

- name: download file with check (md5)
  get_url: 
    url: http://example.com/path/file.conf 
    dest: /etc/foo.conf
    checksum: md5:66dffb5228a211e61d6d7ef4a86f5758

- name: download file from a file path
  get_url: 
    url: "file:///tmp/afile.txt" 
    dest: /tmp/afilecopy.txt  
'''

from ansible.module_utils.six.moves.urllib.parse import urlsplit

# ==============================================================
# url handling

def url_filename(url):
    fn = os.path.basename(urlsplit(url)[2])
    if fn == '':
        return 'index.html'
    return fn

def url_get(module, url, dest, use_proxy, last_mod_time, force, timeout=10, headers=None, tmp_dest=''):
    """
    Download data from the url and store in a temporary file.

    Return (tempfile, info about the request)
    """

    rsp, info = fetch_url(module, url, use_proxy=use_proxy, force=force, last_mod_time=last_mod_time, timeout=timeout, headers=headers)

    if info['status'] == 304:
        module.exit_json(url=url, dest=dest, changed=False, msg=info.get('msg', ''))

    # Exceptions in fetch_url may result in a status -1, the ensures a proper error to the user in all cases
    if info['status'] == -1:
        module.fail_json(msg=info['msg'], url=url, dest=dest)

    if info['status'] != 200 and not url.startswith('file:/') and not (url.startswith('ftp:/') and info.get('msg', '').startswith('OK')):
        module.fail_json(msg="Request failed", status_code=info['status'], response=info['msg'], url=url, dest=dest)

    # create a temporary file and copy content to do checksum-based replacement
    if tmp_dest != '':
        # tmp_dest should be an existing dir
        tmp_dest_is_dir = os.path.isdir(tmp_dest)
        if not tmp_dest_is_dir:
            if os.path.exists(tmp_dest):
                module.fail_json(msg="%s is a file but should be a directory." % tmp_dest)
            else:
                module.fail_json(msg="%s directory does not exist." % tmp_dest)

        fd, tempname = tempfile.mkstemp(dir=tmp_dest)
    else:
        fd, tempname = tempfile.mkstemp()

    f = os.fdopen(fd, 'wb')
    try:
        shutil.copyfileobj(rsp, f)
    except Exception:
        err = get_exception()
        os.remove(tempname)
        module.fail_json(msg="failed to create temporary content file: %s" % str(err))
    f.close()
    rsp.close()
    return tempname, info

def extract_filename_from_headers(headers):
    """
    Extracts a filename from the given dict of HTTP headers.

    Looks for the content-disposition header and applies a regex.
    Returns the filename if successful, else None."""
    cont_disp_regex = 'attachment; ?filename="?([^"]+)'
    res = None

    if 'content-disposition' in headers:
        cont_disp = headers['content-disposition']
        match = re.match(cont_disp_regex, cont_disp)
        if match:
            res = match.group(1)
            # Try preventing any funny business.
            res = os.path.basename(res)

    return res


# ==============================================================
# main

def main():
    argument_spec = url_argument_spec()
    argument_spec.update(
        url = dict(required=True),
        dest = dict(required=True),
        backup = dict(default=False, type='bool'),
        sha256sum = dict(default=''),
        checksum = dict(default=''),
        timeout = dict(required=False, type='int', default=10),
        headers = dict(required=False, default=None),
        tmp_dest = dict(required=False, default=''),
    )

    module = AnsibleModule(
        # not checking because of daisy chain to file module
        argument_spec = argument_spec,
        add_file_common_args=True
    )

    url  = module.params['url']
    dest = os.path.expanduser(module.params['dest'])
    backup = module.params['backup']
    force = module.params['force']
    sha256sum = module.params['sha256sum']
    checksum = module.params['checksum']
    use_proxy = module.params['use_proxy']
    timeout = module.params['timeout']
    tmp_dest = os.path.expanduser(module.params['tmp_dest'])

    # Parse headers to dict
    if module.params['headers']:
        try:
            headers = dict(item.split(':', 1) for item in module.params['headers'].split(','))
        except:
            module.fail_json(msg="The header parameter requires a key:value,key:value syntax to be properly parsed.")
    else:
        headers = None

    dest_is_dir = os.path.isdir(dest)
    last_mod_time = None

    # workaround for usage of deprecated sha256sum parameter
    if sha256sum != '':
        checksum = 'sha256:%s' % (sha256sum)

    # checksum specified, parse for algorithm and checksum
    if checksum != '':
        try:
            algorithm, checksum = checksum.rsplit(':', 1)
            # Remove any non-alphanumeric characters, including the infamous
            # Unicode zero-width space
            checksum = re.sub(r'\W+', '', checksum).lower()
            # Ensure the checksum portion is a hexdigest
            int(checksum, 16)
        except ValueError:
            module.fail_json(msg="The checksum parameter has to be in format <algorithm>:<checksum>")

    if not dest_is_dir and os.path.exists(dest):
        checksum_mismatch = False

        # If the download is not forced and there is a checksum, allow
        # checksum match to skip the download.
        if not force and checksum != '':
            destination_checksum = module.digest_from_file(dest, algorithm)

            if checksum == destination_checksum:
                module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)

            checksum_mismatch = True

        # Not forcing redownload, unless checksum does not match
        if not force and not checksum_mismatch:
            # allow file attribute changes
            module.params['path'] = dest
            file_args = module.load_file_common_arguments(module.params)
            file_args['path'] = dest
            changed = module.set_fs_attributes_if_different(file_args, False)

            if changed:
                module.exit_json(msg="file already exists but file attributes changed", dest=dest, url=url, changed=changed)
            module.exit_json(msg="file already exists", dest=dest, url=url, changed=changed)

        # If the file already exists, prepare the last modified time for the
        # request.
        mtime = os.path.getmtime(dest)
        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)

        # If the checksum does not match we have to force the download
        # because last_mod_time may be newer than on remote
        if checksum_mismatch:
            force = True

    # download to tmpsrc
    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time, force, timeout, headers, tmp_dest)

    # Now the request has completed, we can finally generate the final
    # destination file name from the info dict.

    if dest_is_dir:
        filename = extract_filename_from_headers(info)
        if not filename:
            # Fall back to extracting the filename from the URL.
            # Pluck the URL from the info, since a redirect could have changed
            # it.
            filename = url_filename(info['url'])
        dest = os.path.join(dest, filename)

    checksum_src   = None
    checksum_dest  = None

    # raise an error if there is no tmpsrc file
    if not os.path.exists(tmpsrc):
        os.remove(tmpsrc)
        module.fail_json(msg="Request failed", status_code=info['status'], response=info['msg'])
    if not os.access(tmpsrc, os.R_OK):
        os.remove(tmpsrc)
        module.fail_json( msg="Source %s not readable" % (tmpsrc))
    checksum_src = module.sha1(tmpsrc)

    # check if there is no dest file
    if os.path.exists(dest):
        # raise an error if copy has no permission on dest
        if not os.access(dest, os.W_OK):
            os.remove(tmpsrc)
            module.fail_json( msg="Destination %s not writable" % (dest))
        if not os.access(dest, os.R_OK):
            os.remove(tmpsrc)
            module.fail_json( msg="Destination %s not readable" % (dest))
        checksum_dest = module.sha1(dest)
    else:
        if not os.access(os.path.dirname(dest), os.W_OK):
            os.remove(tmpsrc)
            module.fail_json( msg="Destination %s not writable" % (os.path.dirname(dest)))

    backup_file = None
    if checksum_src != checksum_dest:
        try:
            if backup:
                if os.path.exists(dest):
                    backup_file = module.backup_local(dest)
            shutil.copyfile(tmpsrc, dest)
        except Exception: 
            err = get_exception()
            os.remove(tmpsrc)
            module.fail_json(msg="failed to copy %s to %s: %s" % (tmpsrc, dest, str(err)))
        changed = True
    else:
        changed = False

    if checksum != '':
        destination_checksum = module.digest_from_file(dest, algorithm)

        if checksum != destination_checksum:
            os.remove(dest)
            module.fail_json(msg="The checksum for %s did not match %s; it was %s." % (dest, checksum, destination_checksum))

    os.remove(tmpsrc)

    # allow file attribute changes
    module.params['path'] = dest
    file_args = module.load_file_common_arguments(module.params)
    file_args['path'] = dest
    changed = module.set_fs_attributes_if_different(file_args, changed)

    # Backwards compat only.  We'll return None on FIPS enabled systems
    try:
        md5sum = module.md5(dest)
    except ValueError:
        md5sum = None

    res_args = dict(
        url = url, dest = dest, src = tmpsrc, md5sum = md5sum, checksum_src = checksum_src,
        checksum_dest = checksum_dest, changed = changed, msg = info.get('msg', '')
    )
    if backup_file:
        res_args['backup_file'] = backup_file

    # Mission complete
    module.exit_json(**res_args)

# import module snippets
from ansible.module_utils.basic import *
from ansible.module_utils.urls import *
if __name__ == '__main__':
    main()