proto_debug.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. # TODO: clean this file up more and heavily refactor
  2. ''' Helper functions for reverse engineering protobuf.
  3. Basic guide:
  4. Run interactively with python3 -i proto_debug.py
  5. The function dec will decode a base64 string
  6. (regardless of whether it includes = or %3D at the end) to a bytestring
  7. The function pb (parse_protobuf) will return a list of tuples.
  8. Each tuple is (wire_type, field_number, field_data)
  9. The function enc encodes as base64 (inverse of dec)
  10. The function uenc is like enc but replaces = with %3D
  11. See https://developers.google.com/protocol-buffers/docs/encoding#structure
  12. Example usage:
  13. >>> pb(dec('4qmFsgJcEhhVQ1lPX2phYl9lc3VGUlY0YjE3QUp0QXcaQEVnWjJhV1JsYjNNWUF5QUFNQUU0QWVvREdFTm5Ua1JSVlVWVFEzZHBYM2gwTTBaeFRuRkZiRFZqUWclM0QlM0Q%3D'))
  14. [(2, 80226972, b'\x12\x18UCYO_jab_esuFRV4b17AJtAw\x1a@EgZ2aWRlb3MYAyAAMAE4AeoDGENnTkRRVUVTQ3dpX3h0M0ZxTnFFbDVjQg%3D%3D')]
  15. >>> pb(b'\x12\x18UCYO_jab_esuFRV4b17AJtAw\x1a@EgZ2aWRlb3MYAyAAMAE4AeoDGENnTkRRVUVTQ3dpX3h0M0ZxTnFFbDVjQg%3D%3D')
  16. [(2, 2, b'UCYO_jab_esuFRV4b17AJtAw'), (2, 3, b'EgZ2aWRlb3MYAyAAMAE4AeoDGENnTkRRVUVTQ3dpX3h0M0ZxTnFFbDVjQg%3D%3D')]
  17. >>> pb(dec(b'EgZ2aWRlb3MYAyAAMAE4AeoDGENnTkRRVUVTQ3dpX3h0M0ZxTnFFbDVjQg%3D%3D'))
  18. [(2, 2, b'videos'), (0, 3, 3), (0, 4, 0), (0, 6, 1), (0, 7, 1), (2, 61, b'CgNDQUESCwi_xt3FqNqEl5cB')]
  19. >>> pb(dec(b'CgNDQUESCwi_xt3FqNqEl5cB'))
  20. [(2, 1, b'CAA'), (2, 2, b'\x08\xbf\xc6\xdd\xc5\xa8\xda\x84\x97\x97\x01')]
  21. >>> pb(b'\x08\xbf\xc6\xdd\xc5\xa8\xda\x84\x97\x97\x01')
  22. [(0, 1, 10893665244101960511)]
  23. >>> pb(dec(b'CAA'))
  24. [(0, 1, 0)]
  25. The function recursive_pb will try to do dec/pb recursively automatically.
  26. It's a dumb function (so might try to dec or pb something that isn't really
  27. base64 or protobuf) so be careful.
  28. The function pp will pretty print the recursive structure:
  29. >>> pp(recursive_pb('4qmFsgJcEhhVQ1lPX2phYl9lc3VGUlY0YjE3QUp0QXcaQEVnWjJhV1JsYjNNWUF5QUFNQUU0QWVvREdFTm5Ua1JSVlVWVFEzZHBYM2gwTTBaeFRuRkZiRFZqUWclM0QlM0Q%3D'))
  30. ('base64p',
  31. [
  32. [2, 80226972,
  33. [
  34. [2, 2, b'UCYO_jab_esuFRV4b17AJtAw'],
  35. [2, 3,
  36. ('base64p',
  37. [
  38. [2, 2, b'videos'],
  39. [0, 3, 3],
  40. [0, 4, 0],
  41. [0, 6, 1],
  42. [0, 7, 1],
  43. [2, 61,
  44. ('base64?',
  45. [
  46. [2, 1, b'CAA'],
  47. [2, 2,
  48. [
  49. [0, 1, 10893665244101960511],
  50. ]
  51. ],
  52. ]
  53. )
  54. ],
  55. ]
  56. )
  57. ],
  58. ]
  59. ],
  60. ]
  61. )
  62. - base64 means a base64 encode with equals sign paddings
  63. - base64s means a base64 encode without padding
  64. - base64p means a url base64 encode with equals signs replaced with %3D
  65. - base64? means the base64 type cannot be inferred because of the length
  66. make_proto is the inverse function. It will take a recursive_pb structure and
  67. make a ctoken out of it, so in general,
  68. x == make_proto(recursive_pb(x))
  69. There are some other functions I wrote while reverse engineering stuff
  70. that may or may not be useful.
  71. '''
  72. import urllib.request
  73. import urllib.parse
  74. import re
  75. import time
  76. import json
  77. import os
  78. import pprint
  79. # ------ from proto.py -----------------------------------------------
  80. from math import ceil
  81. import base64
  82. import io
  83. def byte(n):
  84. return bytes((n,))
  85. def varint_encode(offset):
  86. '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one.
  87. The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is
  88. aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as:
  89. 1ccccccc 1bbbbbbb 0aaaaaaa
  90. This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data.
  91. See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.'''
  92. needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case.
  93. encoded_bytes = bytearray(needed_bytes)
  94. for i in range(0, needed_bytes - 1):
  95. encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits
  96. offset = offset >> 7
  97. encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte
  98. return bytes(encoded_bytes)
  99. def varint_decode(encoded):
  100. decoded = 0
  101. for i, byte in enumerate(encoded):
  102. decoded |= (byte & 127) << 7*i
  103. if not (byte & 128):
  104. break
  105. return decoded
  106. def string(field_number, data):
  107. data = as_bytes(data)
  108. return _proto_field(2, field_number, varint_encode(len(data)) + data)
  109. nested = string
  110. def uint(field_number, value):
  111. return _proto_field(0, field_number, varint_encode(value))
  112. def _proto_field(wire_type, field_number, data):
  113. ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
  114. return varint_encode((field_number << 3) | wire_type) + data
  115. def percent_b64encode(data):
  116. return base64.urlsafe_b64encode(data).replace(b'=', b'%3D')
  117. def unpadded_b64encode(data):
  118. return base64.urlsafe_b64encode(data).replace(b'=', b'')
  119. def as_bytes(value):
  120. if isinstance(value, str):
  121. return value.encode('utf-8')
  122. return value
  123. def read_varint(data):
  124. result = 0
  125. i = 0
  126. while True:
  127. try:
  128. byte = data.read(1)[0]
  129. except IndexError:
  130. if i == 0:
  131. raise EOFError()
  132. raise Exception('Unterminated varint starting at ' + str(data.tell() - i))
  133. result |= (byte & 127) << 7*i
  134. if not byte & 128:
  135. break
  136. i += 1
  137. return result
  138. def read_group(data, end_sequence):
  139. start = data.tell()
  140. index = data.original.find(end_sequence, start)
  141. if index == -1:
  142. raise Exception('Unterminated group')
  143. data.seek(index + len(end_sequence))
  144. return data.original[start:index]
  145. def parse(data, include_wire_type=False):
  146. '''Returns a dict mapping field numbers to values
  147. data is the protobuf structure, which must not be b64-encoded'''
  148. if include_wire_type:
  149. return {field_number: [wire_type, value]
  150. for wire_type, field_number, value in read_protobuf(data)}
  151. return {field_number: value
  152. for _, field_number, value in read_protobuf(data)}
# Maps each base64 directive name to the function that encodes bytes in
# that flavour.
base64_enc_funcs = {
    # padded with "="
    'base64': base64.urlsafe_b64encode,
    # padding removed
    'base64s': unpadded_b64encode,
    # padding written as "%3D"
    'base64p': percent_b64encode,
    # flavour unknown; encoded the same way as 'base64'
    'base64?': base64.urlsafe_b64encode,
}
  159. def _make_protobuf(data):
  160. # must be dict mapping field_number to [wire_type, value]
  161. if isinstance(data, dict):
  162. new_data = []
  163. for field_num, (wire_type, value) in sorted(data.items()):
  164. new_data.append((wire_type, field_num, value))
  165. data = new_data
  166. if isinstance(data, str):
  167. return data.encode('utf-8')
  168. elif len(data) == 2 and data[0] in list(base64_enc_funcs.keys()):
  169. return base64_enc_funcs[data[0]](_make_protobuf(data[1]))
  170. elif isinstance(data, list):
  171. result = b''
  172. for field in data:
  173. if field[0] == 0:
  174. result += uint(field[1], field[2])
  175. elif field[0] == 2:
  176. result += string(field[1], _make_protobuf(field[2]))
  177. else:
  178. raise NotImplementedError('Wire type ' + str(field[0])
  179. + ' not implemented')
  180. return result
  181. return data
  182. def make_protobuf(data):
  183. return _make_protobuf(data).decode('ascii')
  184. make_proto = make_protobuf
  185. def _set_protobuf_value(data, *path, value):
  186. if not path:
  187. return value
  188. op = path[0]
  189. if op in base64_enc_funcs:
  190. inner_data = b64_to_bytes(data)
  191. return base64_enc_funcs[op](
  192. _set_protobuf_value(inner_data, *path[1:], value=value)
  193. )
  194. pb_dict = parse(data, include_wire_type=True)
  195. pb_dict[op][1] = _set_protobuf_value(
  196. pb_dict[op][1], *path[1:], value=value
  197. )
  198. return _make_protobuf(pb_dict)
  199. def set_protobuf_value(data, *path, value):
  200. '''Set a field's value in a raw protobuf structure
  201. path is a list of field numbers and/or base64 encoding directives
  202. The directives are
  203. base64: normal base64 encoding with equal signs padding
  204. base64s ("stripped"): no padding
  205. base64p: %3D instead of = for padding
  206. return new_protobuf, err'''
  207. try:
  208. new_protobuf = _set_protobuf_value(data, *path, value=value)
  209. return new_protobuf.decode('ascii'), None
  210. except Exception:
  211. return None, traceback.format_exc()
  212. def b64_to_bytes(data):
  213. if isinstance(data, bytes):
  214. data = data.decode('ascii')
  215. data = data.replace("%3D", "=")
  216. return base64.urlsafe_b64decode(data + "="*((4 - len(data) % 4) % 4))
  217. # --------------------------------------------------------------------
  218. dec = b64_to_bytes
  219. def get_b64_type(data):
  220. '''return base64, base64s, base64p, or base64?'''
  221. if isinstance(data, str):
  222. data = data.encode('ascii')
  223. if data.endswith(b'='):
  224. return 'base64'
  225. if data.endswith(b'%3D'):
  226. return 'base64p'
  227. # Length of data means it wouldn't have an equals sign,
  228. # so we can't tell which type it is.
  229. if len(data) % 4 == 0:
  230. return 'base64?'
  231. return 'base64s'
  232. def enc(t):
  233. return base64.urlsafe_b64encode(t).decode('ascii')
  234. def uenc(t):
  235. return enc(t).replace("=", "%3D")
  236. def b64_to_ascii(t):
  237. return base64.urlsafe_b64decode(t).decode('ascii', errors='replace')
  238. def b64_to_bin(t):
  239. decoded = base64.urlsafe_b64decode(t)
  240. # print(len(decoded)*8)
  241. return " ".join(["{:08b}".format(x) for x in decoded])
  242. def bytes_to_bin(t):
  243. return " ".join(["{:08b}".format(x) for x in t])
  244. def bin_to_bytes(t):
  245. return int(t, 2).to_bytes((len(t) + 7) // 8, 'big')
  246. def bytes_to_hex(t):
  247. return ' '.join(hex(n)[2:].zfill(2) for n in t)
  248. tohex = bytes_to_hex
  249. fromhex = bytes.fromhex
  250. def aligned_ascii(data):
  251. return ' '.join(' ' + chr(n) if n in range(32, 128) else ' _' for n in data)
  252. def parse_protobuf(data, mutable=False, spec=()):
  253. data_original = data
  254. data = io.BytesIO(data)
  255. data.original = data_original
  256. while True:
  257. try:
  258. tag = read_varint(data)
  259. except EOFError:
  260. break
  261. wire_type = tag & 7
  262. field_number = tag >> 3
  263. if wire_type == 0:
  264. value = read_varint(data)
  265. elif wire_type == 1:
  266. value = data.read(8)
  267. elif wire_type == 2:
  268. length = read_varint(data)
  269. value = data.read(length)
  270. elif wire_type == 3:
  271. end_bytes = varint_encode((field_number << 3) | 4)
  272. value = read_group(data, end_bytes)
  273. elif wire_type == 5:
  274. value = data.read(4)
  275. else:
  276. raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + bytes_to_hex(varint_encode(tag)) + ", at position " + str(data.tell()))
  277. if mutable:
  278. yield [wire_type, field_number, value]
  279. else:
  280. yield (wire_type, field_number, value)
  281. read_protobuf = parse_protobuf
  282. def pb(data, mutable=False):
  283. return list(parse_protobuf(data, mutable=mutable))
  284. def bytes_to_base4(data):
  285. result = ''
  286. for b in data:
  287. result += str(b >> 6) + str((b >> 4) & 0b11) + str((b >> 2) & 0b11) + str(b & 0b11)
  288. return result
  289. import re
  290. import struct
  291. import binascii
  292. # Base32 encoding/decoding must be done in Python
  293. _b32alphabet = b'abcdefghijklmnopqrstuvwxyz012345'
  294. _b32tab2 = None
  295. _b32rev = None
  296. bytes_types = (bytes, bytearray) # Types acceptable as binary data
  297. def _bytes_from_decode_data(s):
  298. if isinstance(s, str):
  299. try:
  300. return s.encode('ascii')
  301. except UnicodeEncodeError:
  302. raise ValueError('string argument should contain only ASCII characters')
  303. if isinstance(s, bytes_types):
  304. return s
  305. try:
  306. return memoryview(s).tobytes()
  307. except TypeError:
  308. raise TypeError("argument should be a bytes-like object or ASCII "
  309. "string, not %r" % s.__class__.__name__) from None
  310. def b32decode(s, casefold=False, map01=None):
  311. """Decode the Base32 encoded bytes-like object or ASCII string s.
  312. Optional casefold is a flag specifying whether a lowercase alphabet is
  313. acceptable as input. For security purposes, the default is False.
  314. RFC 3548 allows for optional mapping of the digit 0 (zero) to the
  315. letter O (oh), and for optional mapping of the digit 1 (one) to
  316. either the letter I (eye) or letter L (el). The optional argument
  317. map01 when not None, specifies which letter the digit 1 should be
  318. mapped to (when map01 is not None, the digit 0 is always mapped to
  319. the letter O). For security purposes the default is None, so that
  320. 0 and 1 are not allowed in the input.
  321. The result is returned as a bytes object. A binascii.Error is raised if
  322. the input is incorrectly padded or if there are non-alphabet
  323. characters present in the input.
  324. """
  325. global _b32rev
  326. # Delay the initialization of the table to not waste memory
  327. # if the function is never called
  328. if _b32rev is None:
  329. _b32rev = {v: k for k, v in enumerate(_b32alphabet)}
  330. s = _bytes_from_decode_data(s)
  331. if len(s) % 8:
  332. raise binascii.Error('Incorrect padding')
  333. # Handle section 2.4 zero and one mapping. The flag map01 will be either
  334. # False, or the character to map the digit 1 (one) to. It should be
  335. # either L (el) or I (eye).
  336. if map01 is not None:
  337. map01 = _bytes_from_decode_data(map01)
  338. assert len(map01) == 1, repr(map01)
  339. s = s.translate(bytes.maketrans(b'01', b'O' + map01))
  340. if casefold:
  341. s = s.upper()
  342. # Strip off pad characters from the right. We need to count the pad
  343. # characters because this will tell us how many null bytes to remove from
  344. # the end of the decoded string.
  345. l = len(s)
  346. s = s.rstrip(b'=')
  347. padchars = l - len(s)
  348. # Now decode the full quanta
  349. decoded = bytearray()
  350. b32rev = _b32rev
  351. for i in range(0, len(s), 8):
  352. quanta = s[i: i + 8]
  353. acc = 0
  354. try:
  355. for c in quanta:
  356. acc = (acc << 5) + b32rev[c]
  357. except KeyError:
  358. raise binascii.Error('Non-base32 digit found') from None
  359. decoded += acc.to_bytes(5, 'big')
  360. # Process the last, partial quanta
  361. if padchars:
  362. acc <<= 5 * padchars
  363. last = acc.to_bytes(5, 'big')
  364. if padchars == 1:
  365. decoded[-5:] = last[:-1]
  366. elif padchars == 3:
  367. decoded[-5:] = last[:-2]
  368. elif padchars == 4:
  369. decoded[-5:] = last[:-3]
  370. elif padchars == 6:
  371. decoded[-5:] = last[:-4]
  372. else:
  373. raise binascii.Error('Incorrect padding')
  374. return bytes(decoded)
  375. def dec32(data):
  376. if isinstance(data, bytes):
  377. data = data.decode('ascii')
  378. return b32decode(data + "="*((8 - len(data)%8)%8))
# (prefix, total length) pairs describing known id formats.
# is_youtube_object_id reports a match when the data has exactly the given
# length and starts with the given prefix.
_patterns = [
    (b'UC', 24), # channel
    (b'PL', 34), # playlist
    (b'LL', 24), # liked videos playlist
    (b'UU', 24), # user uploads playlist
    (b'RD', 15), # radio mix
    (b'RD', 43), # radio mix
    (b'', 11), # video
    (b'Ug', 26), # comment
    (b'Ug', 49), # comment reply (of form parent_id.reply_id)
    (b'9', 22), # comment reply id
]
  391. def is_youtube_object_id(data):
  392. try:
  393. if isinstance(data, str):
  394. data = data.encode('ascii')
  395. except Exception:
  396. return False
  397. for start_sequence, length in _patterns:
  398. if len(data) == length and data.startswith(start_sequence):
  399. return True
  400. return False
  401. def recursive_pb(data):
  402. try:
  403. # check if this fits the basic requirements for base64
  404. if isinstance(data, str) or all(i > 32 for i in data):
  405. if len(data) > 11 and not is_youtube_object_id(data):
  406. raw_data = b64_to_bytes(data)
  407. b64_type = get_b64_type(data)
  408. rpb = recursive_pb(raw_data)
  409. if rpb == raw_data:
  410. # could not interpret as protobuf, probably not b64
  411. return data
  412. return (b64_type, rpb)
  413. else:
  414. return data
  415. except Exception as e:
  416. return data
  417. try:
  418. result = pb(data, mutable=True)
  419. except Exception as e:
  420. return data
  421. for tuple in result:
  422. if tuple[0] == 2:
  423. tuple[2] = recursive_pb(tuple[2])
  424. return result
  425. def indent_lines(lines, indent):
  426. return re.sub(r'^', ' '*indent, lines, flags=re.MULTILINE)
  427. def _pp(obj, indent): # not my best work
  428. if isinstance(obj, tuple):
  429. if len(obj) == 3: # (wire_type, field_number, data)
  430. return obj.__repr__()
  431. else: # (base64, [...])
  432. return ('(' + obj[0].__repr__() + ',\n'
  433. + indent_lines(_pp(obj[1], indent), indent) + '\n'
  434. + ')')
  435. elif isinstance(obj, list):
  436. # [wire_type, field_number, data]
  437. if (len(obj) == 3
  438. and not any(isinstance(x, (list, tuple)) for x in obj)
  439. ):
  440. return obj.__repr__()
  441. # [wire_type, field_number, [...]]
  442. elif (len(obj) == 3
  443. and not any(isinstance(x, (list, tuple)) for x in obj[0:2])
  444. ):
  445. return ('[' + obj[0].__repr__() + ', ' + obj[1].__repr__() + ',\n'
  446. + indent_lines(_pp(obj[2], indent), indent) + '\n'
  447. + ']')
  448. else:
  449. s = '[\n'
  450. for x in obj:
  451. s += indent_lines(_pp(x, indent), indent) + ',\n'
  452. s += ']'
  453. return s
  454. else:
  455. return obj.__repr__()
  456. def pp(obj, indent=1):
  457. '''Pretty prints the recursive pb structure'''
  458. print(_pp(obj, indent))
# User-Agent string identifying as desktop Firefox 52 on Windows.
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
# Header (name, value) pairs for the desktop client, with the User-Agent
# appended last.
# NOTE(review): presumably sent with requests made elsewhere in this file;
# the use site is not visible here.
desktop_headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '1'),
    ('X-YouTube-Client-Version', '2.20180830'),
) + (('User-Agent', desktop_user_agent),)
# User-Agent string identifying as mobile Chrome 69 on Android 7.0.
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
# Same header set as desktop_headers, but with client name '2' and the
# mobile User-Agent.
mobile_headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '2'),
    ('X-YouTube-Client-Version', '2.20180830'),
) + (('User-Agent', mobile_user_agent),)