download-jamendo.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. #!/usr/bin/env python
  2. # Jamendo database dumps can be fetched from: http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz
  3. import xml.etree.cElementTree as ElementTree
  4. import sys, gzip, time, os, os.path, urllib, threading, statvfs, magic
  5. JAMENDO_DUMP_URL="http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz"
  6. MAX_THREADS = 10
  7. MAX_RETRIES = 5
  8. running_threads = 0
  9. class Downloader(threading.Thread):
  10. def __init__(self, filename, url):
  11. global running_threads
  12. threading.Thread.__init__(self)
  13. self.filename = filename
  14. self.url = url
  15. running_threads += 1
  16. def run(self):
  17. global running_threads
  18. global MAX_RETRIES
  19. correct_mime = "application/ogg; charset=binary"
  20. m = magic.open(magic.MAGIC_MIME)
  21. m.load()
  22. retries = 0
  23. current_mime = ""
  24. while retries < MAX_RETRIES and current_mime != correct_mime:
  25. urllib.urlretrieve(self.url, self.filename)
  26. current_mime = m.file(self.filename)
  27. retries += 1
  28. if current_mime != correct_mime:
  29. os.rename(self.filename, '%s.ign' % self.filename[:-4])
  30. running_threads -= 1
  31. class DownloadJamendo:
  32. def __init__(self, destination, dump):
  33. if not os.path.exists(destination):
  34. os.mkdir(destination)
  35. self.destination = destination
  36. self.MAX_FILENAME_LENGTH = os.statvfs(destination)[statvfs.F_NAMEMAX]
  37. self.dump = dump or None
  38. if not self.dump:
  39. print "Downloading Jamendo dump from %s" % JAMENDO_DUMP_URL
  40. (filename, headers) = urllib.urlretrieve(JAMENDO_DUMP_URL, os.path.join(destination, "dbdump_artistalbumtrack.xml.gz"))
  41. print "Jamendo dump saved: %s" % filename
  42. self.dump = gzip.open(filename, "r")
  43. def parse(self):
  44. for event, elem in ElementTree.iterparse(self.dump):
  45. if elem.tag == "artist":
  46. artist = self.proc_artist(elem)
  47. self.download_artist(artist)
  48. def proc_artist(self, elem):
  49. artist = {}
  50. artist["albums"] = []
  51. for artist_e in elem.getchildren():
  52. if artist_e.tag == "name":
  53. artist["name"] = artist_e.text
  54. if artist_e.tag == "Albums":
  55. for album_e in artist_e.getchildren():
  56. artist["albums"].append(self.proc_album(album_e))
  57. return artist
  58. def proc_album(self, elem):
  59. album = {}
  60. album["tracks"] = []
  61. album["name"] = None
  62. for album_e in elem.getchildren():
  63. if album_e.tag == "name":
  64. album["name"] = album_e.text
  65. if album_e.tag == "Tracks":
  66. for track_e in album_e.getchildren():
  67. album["tracks"].append(self.proc_track(track_e))
  68. return album
  69. def proc_track(self, elem):
  70. track = {}
  71. track["id"] = None
  72. track["name"] = None
  73. track["license"] = None
  74. for track_e in elem.getchildren():
  75. if track_e.tag == "id":
  76. track["id"] = int(track_e.text)
  77. if track_e.tag == "name":
  78. track["name"] = track_e.text
  79. if track_e.tag == "license":
  80. track["license"] = track_e.text
  81. return track
  82. def download_artist(self, artist):
  83. global running_threads
  84. for album in artist["albums"]:
  85. for track in album["tracks"]:
  86. if track["id"] and track["name"] and album["name"] and artist["name"] and self.free_license(track["license"]):
  87. trackurl = "http://api.jamendo.com/get2/stream/track/redirect/?id=%d&streamencoding=ogg2" % track["id"]
  88. trackfile = "%s-%s-%s-%s" % (track["id"], artist["name"].replace("/", ""), album["name"].replace("/", ""), track["name"].replace("/", " "))
  89. trackfile = "%s.ogg" % trackfile.encode('utf-8')[:self.MAX_FILENAME_LENGTH-4].decode('utf-8','ignore').encode('utf-8')
  90. ignorefile = "%s.ign" % trackfile[:-4]
  91. trackfilepath = os.path.join(self.destination, trackfile)
  92. ignorefilepath = os.path.join(self.destination, ignorefile)
  93. if os.path.exists(ignorefilepath):
  94. print "Found ignore file for %s" % trackfile
  95. continue
  96. if os.path.exists(trackfilepath):
  97. if os.path.getsize(trackfilepath) < 1024:
  98. print "Removing file with size below 1024 bytes: %s" % trackfilepath
  99. os.remove(trackfilepath)
  100. else:
  101. print "Already downloaded track %s" % trackfilepath
  102. continue
  103. while running_threads > MAX_THREADS:
  104. time.sleep(1)
  105. print "Downloading %s to %s" % (trackurl, trackfilepath)
  106. d = Downloader(trackfilepath, trackurl)
  107. d.start()
  108. tracksymlink = os.path.join(self.destination, "%s.ogg2" % track["id"])
  109. if os.path.lexists(tracksymlink):
  110. os.remove(tracksymlink)
  111. os.symlink(trackfile, tracksymlink)
  112. # 1 second delay between every new request to be nice to server
  113. time.sleep(1)
  114. def free_license(self, license):
  115. return ("http://creativecommons.org/licenses/by-sa" in license or "http://creativecommons.org/licenses/by/" in license or "http://artlibre.org/licence.php/lal.html" in license)
  116. if __name__ == "__main__":
  117. if len(sys.argv) < 2:
  118. print "Usage: download-jamendo.py [<database dump>] <destination>"
  119. sys.exit(1)
  120. if len(sys.argv) == 3:
  121. destination = sys.argv[2]
  122. if sys.argv[1][-2:] == "gz":
  123. dump = gzip.open(sys.argv[1], "r")
  124. else:
  125. dump = open(sys.argv[1], "r")
  126. else:
  127. destination = sys.argv[1]
  128. dump = None
  129. downloader = DownloadJamendo(destination, dump)
  130. downloader.parse()