# get_files.py
# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
# This simple script pulls test files from the webm homepage
# It is intelligent enough to only pull files if
# 1) File / test_data folder does not exist
# 2) SHA mismatch
  13. import pycurl
  14. import csv
  15. import hashlib
  16. import re
  17. import os.path
  18. import time
  19. import itertools
  20. import sys
  21. import getopt
  22. #globals
  23. url = ''
  24. file_list_path = ''
  25. local_resource_path = ''
  26. # Helper functions:
  27. # A simple function which returns the sha hash of a file in hex
  28. def get_file_sha(filename):
  29. try:
  30. sha_hash = hashlib.sha1()
  31. with open(filename, 'rb') as file:
  32. buf = file.read(HASH_CHUNK)
  33. while len(buf) > 0:
  34. sha_hash.update(buf)
  35. buf = file.read(HASH_CHUNK)
  36. return sha_hash.hexdigest()
  37. except IOError:
  38. print "Error reading " + filename
  39. # Downloads a file from a url, and then checks the sha against the passed
  40. # in sha
  41. def download_and_check_sha(url, filename, sha):
  42. path = os.path.join(local_resource_path, filename)
  43. fp = open(path, "wb")
  44. curl = pycurl.Curl()
  45. curl.setopt(pycurl.URL, url + "/" + filename)
  46. curl.setopt(pycurl.WRITEDATA, fp)
  47. curl.perform()
  48. curl.close()
  49. fp.close()
  50. return get_file_sha(path) == sha
  51. #constants
  52. ftp_retries = 3
  53. SHA_COL = 0
  54. NAME_COL = 1
  55. EXPECTED_COL = 2
  56. HASH_CHUNK = 65536
  57. # Main script
  58. try:
  59. opts, args = \
  60. getopt.getopt(sys.argv[1:], \
  61. "u:i:o:", ["url=", "input_csv=", "output_dir="])
  62. except:
  63. print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
  64. sys.exit(2)
  65. for opt, arg in opts:
  66. if opt == '-u':
  67. url = arg
  68. elif opt in ("-i", "--input_csv"):
  69. file_list_path = os.path.join(arg)
  70. elif opt in ("-o", "--output_dir"):
  71. local_resource_path = os.path.join(arg)
  72. if len(sys.argv) != 7:
  73. print "Expects two paths and a url!"
  74. exit(1)
  75. if not os.path.isdir(local_resource_path):
  76. os.makedirs(local_resource_path)
  77. file_list_csv = open(file_list_path, "rb")
  78. # Our 'csv' file uses multiple spaces as a delimiter, python's
  79. # csv class only uses single character delimiters, so we convert them below
  80. file_list_reader = csv.reader((re.sub(' +', ' ', line) \
  81. for line in file_list_csv), delimiter = ' ')
  82. file_shas = []
  83. file_names = []
  84. for row in file_list_reader:
  85. if len(row) != EXPECTED_COL:
  86. continue
  87. file_shas.append(row[SHA_COL])
  88. file_names.append(row[NAME_COL])
  89. file_list_csv.close()
  90. # Download files, only if they don't already exist and have correct shas
  91. for filename, sha in itertools.izip(file_names, file_shas):
  92. path = os.path.join(local_resource_path, filename)
  93. if os.path.isfile(path) \
  94. and get_file_sha(path) == sha:
  95. print path + ' exists, skipping'
  96. continue
  97. for retry in range(0, ftp_retries):
  98. print "Downloading " + path
  99. if not download_and_check_sha(url, filename, sha):
  100. print "Sha does not match, retrying..."
  101. else:
  102. break