git_parser.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. """
  2. Folder parser .git
  3. """
  4. import collections
  5. import zlib
  6. import os
  7. import argparse
# The only non-standard-library dependency in this project is graphviz;
# it is used solely to keep the amount of drawing code small.
  10. from graphviz import Digraph
  11. class GitTree:
  12. """
  13. Object Tree class
  14. """
  15. type = b'tree'
  16. def __init__(self, data, c):
  17. self.items = GitTree.tree_parse(data)
  18. self.my_hash = c
  19. self.trees = [] # list of subtrees
  20. self.blobs = [] # list of blobs
  21. @staticmethod
  22. def get_actual_data(raw, start=0):
  23. """
  24. Get data from a tree file
  25. """
  26. # find first space
  27. x = raw.find(b' ', start)
  28. # Read the mode of working with the file
  29. mode = raw[start:x]
  30. # Find the end of the "file path"
  31. y = raw.find(b'\x00', x)
  32. # read the file path
  33. path = raw[x + 1:y]
  34. # Read hash
  35. sha = hex(
  36. int.from_bytes(
  37. raw[y + 1:y + 21], "big"))[2:]
  38. return y + 21, GitTreeLeaf(mode, path, sha)
  39. @staticmethod
  40. def tree_parse(raw):
  41. """
  42. Parse tree
  43. """
  44. pos = 0
  45. max = len(raw)
  46. ret = list()
  47. while pos < max:
  48. pos, data = GitTree.get_actual_data(raw, pos)
  49. ret.append(data)
  50. return ret
  51. class GitCommit:
  52. """
  53. Commit class
  54. """
  55. type = b'commit'
  56. def __init__(self, data, c):
  57. self.data = GitCommit.data_parse(data)
  58. self.my_hash = c
  59. self.tree = None
  60. def set_tree(self, tree: GitTree):
  61. """
  62. Set a tree for a commit
  63. """
  64. self.tree = tree
  65. @staticmethod
  66. def data_parse(raw, start=0, dct=None):
  67. """
  68. Function for parsing information from a commit
  69. :param raw: data
  70. :param start: cursor position, where we start reading from
  71. :param dct: the dictionary where the data is written
  72. """
  73. if not dct:
  74. dct = collections.OrderedDict()
  75. # we are looking for the next place where in theory there can be useful data
  76. spc = raw.find(b' ', start)
  77. nl = raw.find(b'\n', start)
  78. # if the transition to a new line is earlier than the nearest space or
  79. # spaces are missing, then the remaining data is a commit message
  80. if (spc < 0) or (nl < spc):
  81. dct[b''] = raw[start + 1:]
  82. return dct
  83. # read the key for the following data
  84. key = raw[start:spc]
  85. # find the end of the value whose key we read
  86. end = start
  87. while True:
  88. end = raw.find(b'\n', end + 1)
  89. if raw[end + 1] != ord(' '):
  90. break
  91. # write variable
  92. value = raw[spc + 1:end].replace(b'\n ', b'\n')
  93. # checking not to overwrite the data for the key,
  94. # a add new ones
  95. if key in dct:
  96. if type(dct[key]) == list:
  97. dct[key].append(value)
  98. else:
  99. dct[key] = [dct[key], value]
  100. else:
  101. dct[key] = value
  102. return GitCommit.data_parse(raw, start=end + 1, dct=dct)
  103. class GitBlob:
  104. """
  105. Blob class
  106. """
  107. type = b'blob'
  108. def __init__(self, data, c):
  109. self.blobData = data
  110. self.my_hash = c
  111. class GitTreeLeaf(object):
  112. """
  113. Class of tree elements.
  114. In the instances of the class, we write the information that
  115. we read from the files in the folder .git/objects.
  116. And the instances of GitBlob, GitTree, and GitCommit contain data
  117. that we parsed and analyzed.
  118. """
  119. def __init__(self, mode, path, sha):
  120. self.mode = mode
  121. self.path = path
  122. self.sha = sha
  123. class Reader:
  124. """
  125. A class that reads data from .git/objects
  126. """
  127. def __init__(self, path: str):
  128. self.objects = []
  129. try:
  130. os.chdir(path + '/.git/objects')
  131. self.read_objects_folder()
  132. except FileNotFoundError:
  133. print('Can not find .git in '+path)
  134. def read_objects_folder(self):
  135. """
  136. Read all the files inside .git/objects
  137. (except for some)
  138. """
  139. dirs = [name for name in os.listdir(".") if os.path.isdir(name)]
  140. dirs.remove('pack')
  141. dirs.remove('info')
  142. # print(dirs)
  143. for i in dirs:
  144. # print('reading in '+i)
  145. os.chdir(i)
  146. a = Reader.read_objects(i)
  147. if isinstance(a, list):
  148. for j in a:
  149. self.objects.append(j)
  150. else:
  151. self.objects.append(a)
  152. os.chdir('..')
  153. @staticmethod
  154. def read_objects(s: str):
  155. """
  156. Read the files inside the folder and parse them
  157. """
  158. files = [name for name in os.listdir(".")]
  159. res = []
  160. # print(files)
  161. for i in files:
  162. with open(i, "rb") as f:
  163. raw = zlib.decompress(f.read())
  164. # Understand what kind of object it is
  165. x = raw.find(b' ')
  166. fmt = raw[0:x]
  167. # Skip Null terminator
  168. y = raw.find(b'\x00', x)
  169. # Create the desired object
  170. if fmt == b'tree':
  171. c = GitTree
  172. elif fmt == b'blob':
  173. c = GitBlob
  174. elif fmt == b'commit':
  175. c = GitCommit
  176. else:
  177. c = None
  178. if c:
  179. res.append(c(raw[y + 1:], s + i))
  180. return res
  181. class DependenciesResolver:
  182. """
  183. A class that, by hash, restores the hierarchy of objects
  184. """
  185. def __init__(self, objects: list):
  186. self.commits = []
  187. self.trees = []
  188. self.blobs = []
  189. self.set_commits_trees_blobs(objects)
  190. self.set_trees()
  191. def set_commits_trees_blobs(self, objects: list):
  192. """
  193. Sort all of the objects on the commits, trees and blobs
  194. """
  195. for i in objects:
  196. if isinstance(i, GitCommit):
  197. self.commits.append(i)
  198. if isinstance(i, GitTree):
  199. self.trees.append(i)
  200. if isinstance(i, GitBlob):
  201. self.blobs.append(i)
  202. def set_trees(self):
  203. """
  204. Recursively install the dependencies between the trees and blobs
  205. """
  206. for i in self.commits:
  207. h = i.data[b'tree']
  208. i.set_tree(self.__get_tree(h))
  209. for i in self.trees:
  210. for j in i.items:
  211. b = bytes(j.sha, 'utf-8')
  212. tmp = self.__get_tree(b)
  213. if tmp is None:
  214. tmp = self.__get_blob(b)
  215. if tmp is not None:
  216. i.blobs.append(tmp)
  217. else:
  218. i.trees.append(tmp)
  219. def __get_tree(self, h: bytes):
  220. """
  221. Get a tree by hash
  222. """
  223. for i in self.trees:
  224. if h.decode("utf-8") == i.my_hash:
  225. return i
  226. def __get_blob(self, h: bytes):
  227. """
  228. Get a blob by hash
  229. """
  230. for i in self.blobs:
  231. if h.decode("utf-8") == i.my_hash:
  232. return i
  233. class MakeGraph:
  234. """
  235. The class that draws the graph
  236. """
  237. def __init__(self, d: DependenciesResolver):
  238. self.resolver = d
  239. self.graph = Digraph(comment='Monster git parser', format='svg', strict=True)
  240. self.make_basic_nodes()
  241. self.parse_deps()
  242. def make_basic_nodes(self):
  243. """
  244. Create beautiful nodes for commits
  245. """
  246. self.graph.attr('node', shape='square', color='gold1', style='filled')
  247. for commit in self.resolver.commits:
  248. commit_msg = commit.data[b''].decode('utf-8')
  249. commit_sha = commit.my_hash
  250. self.graph.node(commit_sha, commit_msg)
  251. def parse_tree(self, tree: GitTree, parent: str, path: str):
  252. """
  253. Recursively add trees and blobs to the graph
  254. """
  255. self.graph.attr('node', shape='doublecircle')
  256. self.graph.node(tree.my_hash, path)
  257. self.graph.edge(parent, tree.my_hash)
  258. for item in tree.items:
  259. # if object is file
  260. if item.mode == b'100644':
  261. path = item.path.decode('utf-8')
  262. if MakeGraph.need_to_draw(path):
  263. i_hash = item.sha
  264. cur_item = i_hash + path
  265. self.graph.attr('node', shape='circle')
  266. self.graph.node(cur_item, path)
  267. self.graph.edge(tree.my_hash, cur_item)
  268. blob = ''
  269. for i in tree.blobs:
  270. if i.my_hash == item.sha:
  271. blob = i
  272. break
  273. # match the file to its contents
  274. if isinstance(blob, GitBlob) and blob.blobData.decode("utf-8") != '':
  275. self.graph.attr('node', shape='egg')
  276. # we will output only the first 20 characters of the file to the diagram
  277. self.graph.node(blob.my_hash, blob.blobData.decode("utf-8")[:20])
  278. self.graph.edge(cur_item, blob.my_hash)
  279. # if object is tree
  280. if item.mode == b'40000':
  281. local_tree = None
  282. for i in tree.trees:
  283. if i.my_hash == item.sha:
  284. local_tree = i
  285. break
  286. if local_tree:
  287. self.parse_tree(local_tree, tree.my_hash, item.path.decode('utf-8'))
  288. def parse_deps(self):
  289. """
  290. Install dependencies for commits and start drawing the graph for trees
  291. """
  292. self.graph.attr('node', color='black', style="")
  293. for commit in self.resolver.commits:
  294. commit_msg = commit.data[b''].decode('utf-8')
  295. commit_sha = commit.my_hash
  296. tree = commit.tree
  297. if b'parent' in commit.data.keys():
  298. if isinstance(commit.data[b'parent'], list):
  299. for j in commit.data[b'parent']:
  300. parent = self.get_commit_by_sha(j.decode('utf-8'))
  301. if parent:
  302. self.graph.edge(parent.my_hash, commit_sha)
  303. else:
  304. parent = self.get_commit_by_sha(commit.data[b'parent'].decode('utf-8'))
  305. if parent:
  306. self.graph.edge(parent.my_hash, commit_sha)
  307. self.parse_tree(tree, commit_sha, commit_msg + '`s tree')
  308. def get_commit_by_sha(self, sha: str):
  309. """
  310. Find a commit by hash
  311. """
  312. for i in self.resolver.commits:
  313. if i.my_hash == sha:
  314. return i
  315. @staticmethod
  316. def need_to_draw(name: str):
  317. """
  318. Do we need to draw the file on the diagram?
  319. """
  320. l = ['.txt', '.java', '.cpp', '.html', '.js']
  321. for i in l:
  322. if i in name:
  323. return True
  324. return False
  325. def get_arguments():
  326. """
  327. Parse arguments from cmd
  328. """
  329. parser = argparse.ArgumentParser()
  330. parser.add_argument("-p", "--path", dest="path", help="Enter path to .git folder")
  331. parser.add_argument('--no-input', dest='no_console', action='store_false')
  332. options = parser.parse_args()
  333. return options
  334. if __name__ == '__main__':
  335. # get path to .git from args
  336. path = get_arguments().path
  337. if path is None and get_arguments().no_console:
  338. path = str(input("Enter path to .git folder:"))
  339. elif path is None:
  340. path = '.'
  341. absolutePath = os.path.abspath('')
  342. # read the objects and return to the initial folder
  343. a = Reader(path)
  344. os.chdir(absolutePath)
  345. # identify the dependencies, and draw the graph
  346. if len(a.objects) > 0:
  347. b = DependenciesResolver(a.objects)
  348. m = MakeGraph(b)
  349. # output a text representation to the console and render the image
  350. print(m.graph.source)
  351. m.graph.render()