bmt.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. // Copyright 2017 The go-ethereum Authors
  2. // This file is part of the go-ethereum library.
  3. //
  4. // The go-ethereum library is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Lesser General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // The go-ethereum library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Lesser General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Lesser General Public License
  15. // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
  16. // Package bmt provides a binary merkle tree implementation
  17. package bmt
  18. import (
  19. "fmt"
  20. "hash"
  21. "io"
  22. "strings"
  23. "sync"
  24. "sync/atomic"
  25. )
  26. /*
  27. Binary Merkle Tree Hash is a hash function over arbitrary datachunks of limited size
  28. It is defined as the root hash of the binary merkle tree built over fixed size segments
  29. of the underlying chunk using any base hash function (e.g keccak 256 SHA3)
  30. It is used as the chunk hash function in swarm which in turn is the basis for the
  31. 128 branching swarm hash http://swarm-guide.readthedocs.io/en/latest/architecture.html#swarm-hash
  32. The BMT is optimal for providing compact inclusion proofs, i.e. prove that a
  33. segment is a substring of a chunk starting at a particular offset
  34. The size of the underlying segments is fixed at 32 bytes (called the resolution
  35. of the BMT hash), the EVM word size to optimize for on-chain BMT verification
  36. as well as the hash size optimal for inclusion proofs in the merkle tree of the swarm hash.
  37. Two implementations are provided:
  38. * RefHasher is optimized for code simplicity and meant as a reference implementation
  39. * Hasher is optimized for speed taking advantage of concurrency with minimalistic
  40. control structure to coordinate the concurrent routines
  41. It implements the ChunkHash interface as well as the go standard hash.Hash interface
  42. */
  43. const (
  44. // DefaultSegmentCount is the maximum number of segments of the underlying chunk
  45. DefaultSegmentCount = 128 // Should be equal to storage.DefaultBranches
  46. // DefaultPoolSize is the maximum number of bmt trees used by the hashers, i.e,
  47. // the maximum number of concurrent BMT hashing operations performed by the same hasher
  48. DefaultPoolSize = 8
  49. )
  50. // BaseHasher is a hash.Hash constructor function used for the base hash of the BMT.
  51. type BaseHasher func() hash.Hash
  52. // Hasher a reusable hasher for fixed maximum size chunks representing a BMT
  53. // implements the hash.Hash interface
  54. // reuse pool of Tree-s for amortised memory allocation and resource control
  55. // supports order-agnostic concurrent segment writes
  56. // as well as sequential read and write
  57. // can not be called concurrently on more than one chunk
  58. // can be further appended after Sum
  59. // Reset gives back the Tree to the pool and guaranteed to leave
  60. // the tree and itself in a state reusable for hashing a new chunk
  61. type Hasher struct {
  62. pool *TreePool // BMT resource pool
  63. bmt *Tree // prebuilt BMT resource for flowcontrol and proofs
  64. blocksize int // segment size (size of hash) also for hash.Hash
  65. count int // segment count
  66. size int // for hash.Hash same as hashsize
  67. cur int // cursor position for rightmost currently open chunk
  68. segment []byte // the rightmost open segment (not complete)
  69. depth int // index of last level
  70. result chan []byte // result channel
  71. hash []byte // to record the result
  72. max int32 // max segments for SegmentWriter interface
  73. blockLength []byte // The block length that needes to be added in Sum
  74. }
  75. // New creates a reusable Hasher
  76. // implements the hash.Hash interface
  77. // pulls a new Tree from a resource pool for hashing each chunk
  78. func New(p *TreePool) *Hasher {
  79. return &Hasher{
  80. pool: p,
  81. depth: depth(p.SegmentCount),
  82. size: p.SegmentSize,
  83. blocksize: p.SegmentSize,
  84. count: p.SegmentCount,
  85. result: make(chan []byte),
  86. }
  87. }
  88. // Node is a reuseable segment hasher representing a node in a BMT
  89. // it allows for continued writes after a Sum
  90. // and is left in completely reusable state after Reset
  91. type Node struct {
  92. level, index int // position of node for information/logging only
  93. initial bool // first and last node
  94. root bool // whether the node is root to a smaller BMT
  95. isLeft bool // whether it is left side of the parent double segment
  96. unbalanced bool // indicates if a node has only the left segment
  97. parent *Node // BMT connections
  98. state int32 // atomic increment impl concurrent boolean toggle
  99. left, right []byte
  100. }
  101. // NewNode constructor for segment hasher nodes in the BMT
  102. func NewNode(level, index int, parent *Node) *Node {
  103. return &Node{
  104. parent: parent,
  105. level: level,
  106. index: index,
  107. initial: index == 0,
  108. isLeft: index%2 == 0,
  109. }
  110. }
  111. // TreePool provides a pool of Trees used as resources by Hasher
  112. // a Tree popped from the pool is guaranteed to have clean state
  113. // for hashing a new chunk
  114. // Hasher Reset releases the Tree to the pool
  115. type TreePool struct {
  116. lock sync.Mutex
  117. c chan *Tree
  118. hasher BaseHasher
  119. SegmentSize int
  120. SegmentCount int
  121. Capacity int
  122. count int
  123. }
  124. // NewTreePool creates a Tree pool with hasher, segment size, segment count and capacity
  125. // on GetTree it reuses free Trees or creates a new one if size is not reached
  126. func NewTreePool(hasher BaseHasher, segmentCount, capacity int) *TreePool {
  127. return &TreePool{
  128. c: make(chan *Tree, capacity),
  129. hasher: hasher,
  130. SegmentSize: hasher().Size(),
  131. SegmentCount: segmentCount,
  132. Capacity: capacity,
  133. }
  134. }
  135. // Drain drains the pool until it has no more than n resources
  136. func (p *TreePool) Drain(n int) {
  137. p.lock.Lock()
  138. defer p.lock.Unlock()
  139. for len(p.c) > n {
  140. <-p.c
  141. p.count--
  142. }
  143. }
  144. // Reserve is blocking until it returns an available Tree
  145. // it reuses free Trees or creates a new one if size is not reached
  146. func (p *TreePool) Reserve() *Tree {
  147. p.lock.Lock()
  148. defer p.lock.Unlock()
  149. var t *Tree
  150. if p.count == p.Capacity {
  151. return <-p.c
  152. }
  153. select {
  154. case t = <-p.c:
  155. default:
  156. t = NewTree(p.hasher, p.SegmentSize, p.SegmentCount)
  157. p.count++
  158. }
  159. return t
  160. }
  161. // Release gives back a Tree to the pool.
  162. // This Tree is guaranteed to be in reusable state
  163. // does not need locking
  164. func (p *TreePool) Release(t *Tree) {
  165. p.c <- t // can never fail but...
  166. }
  167. // Tree is a reusable control structure representing a BMT
  168. // organised in a binary tree
  169. // Hasher uses a TreePool to pick one for each chunk hash
  170. // the Tree is 'locked' while not in the pool
  171. type Tree struct {
  172. leaves []*Node
  173. }
  174. // Draw draws the BMT (badly)
  175. func (t *Tree) Draw(hash []byte, d int) string {
  176. var left, right []string
  177. var anc []*Node
  178. for i, n := range t.leaves {
  179. left = append(left, fmt.Sprintf("%v", hashstr(n.left)))
  180. if i%2 == 0 {
  181. anc = append(anc, n.parent)
  182. }
  183. right = append(right, fmt.Sprintf("%v", hashstr(n.right)))
  184. }
  185. anc = t.leaves
  186. var hashes [][]string
  187. for l := 0; len(anc) > 0; l++ {
  188. var nodes []*Node
  189. hash := []string{""}
  190. for i, n := range anc {
  191. hash = append(hash, fmt.Sprintf("%v|%v", hashstr(n.left), hashstr(n.right)))
  192. if i%2 == 0 && n.parent != nil {
  193. nodes = append(nodes, n.parent)
  194. }
  195. }
  196. hash = append(hash, "")
  197. hashes = append(hashes, hash)
  198. anc = nodes
  199. }
  200. hashes = append(hashes, []string{"", fmt.Sprintf("%v", hashstr(hash)), ""})
  201. total := 60
  202. del := " "
  203. var rows []string
  204. for i := len(hashes) - 1; i >= 0; i-- {
  205. var textlen int
  206. hash := hashes[i]
  207. for _, s := range hash {
  208. textlen += len(s)
  209. }
  210. if total < textlen {
  211. total = textlen + len(hash)
  212. }
  213. delsize := (total - textlen) / (len(hash) - 1)
  214. if delsize > len(del) {
  215. delsize = len(del)
  216. }
  217. row := fmt.Sprintf("%v: %v", len(hashes)-i-1, strings.Join(hash, del[:delsize]))
  218. rows = append(rows, row)
  219. }
  220. rows = append(rows, strings.Join(left, " "))
  221. rows = append(rows, strings.Join(right, " "))
  222. return strings.Join(rows, "\n") + "\n"
  223. }
  224. // NewTree initialises the Tree by building up the nodes of a BMT
  225. // segment size is stipulated to be the size of the hash
  226. // segmentCount needs to be positive integer and does not need to be
  227. // a power of two and can even be an odd number
  228. // segmentSize * segmentCount determines the maximum chunk size
  229. // hashed using the tree
  230. func NewTree(hasher BaseHasher, segmentSize, segmentCount int) *Tree {
  231. n := NewNode(0, 0, nil)
  232. n.root = true
  233. prevlevel := []*Node{n}
  234. // iterate over levels and creates 2^level nodes
  235. level := 1
  236. count := 2
  237. for d := 1; d <= depth(segmentCount); d++ {
  238. nodes := make([]*Node, count)
  239. for i := 0; i < len(nodes); i++ {
  240. parent := prevlevel[i/2]
  241. t := NewNode(level, i, parent)
  242. nodes[i] = t
  243. }
  244. prevlevel = nodes
  245. level++
  246. count *= 2
  247. }
  248. // the datanode level is the nodes on the last level where
  249. return &Tree{
  250. leaves: prevlevel,
  251. }
  252. }
  253. // methods needed by hash.Hash
  254. // Size returns the size
  255. func (h *Hasher) Size() int {
  256. return h.size
  257. }
  258. // BlockSize returns the block size
  259. func (h *Hasher) BlockSize() int {
  260. return h.blocksize
  261. }
  262. // Sum returns the hash of the buffer
  263. // hash.Hash interface Sum method appends the byte slice to the underlying
  264. // data before it calculates and returns the hash of the chunk
  265. func (h *Hasher) Sum(b []byte) (r []byte) {
  266. t := h.bmt
  267. i := h.cur
  268. n := t.leaves[i]
  269. j := i
  270. // must run strictly before all nodes calculate
  271. // datanodes are guaranteed to have a parent
  272. if len(h.segment) > h.size && i > 0 && n.parent != nil {
  273. n = n.parent
  274. } else {
  275. i *= 2
  276. }
  277. d := h.finalise(n, i)
  278. h.writeSegment(j, h.segment, d)
  279. c := <-h.result
  280. h.releaseTree()
  281. // sha3(length + BMT(pure_chunk))
  282. if h.blockLength == nil {
  283. return c
  284. }
  285. res := h.pool.hasher()
  286. res.Reset()
  287. res.Write(h.blockLength)
  288. res.Write(c)
  289. return res.Sum(nil)
  290. }
  291. // Hasher implements the SwarmHash interface
  292. // Hash waits for the hasher result and returns it
  293. // caller must call this on a BMT Hasher being written to
  294. func (h *Hasher) Hash() []byte {
  295. return <-h.result
  296. }
  297. // Hasher implements the io.Writer interface
  298. // Write fills the buffer to hash
  299. // with every full segment complete launches a hasher go routine
  300. // that shoots up the BMT
  301. func (h *Hasher) Write(b []byte) (int, error) {
  302. l := len(b)
  303. if l <= 0 {
  304. return 0, nil
  305. }
  306. s := h.segment
  307. i := h.cur
  308. count := (h.count + 1) / 2
  309. need := h.count*h.size - h.cur*2*h.size
  310. size := h.size
  311. if need > size {
  312. size *= 2
  313. }
  314. if l < need {
  315. need = l
  316. }
  317. // calculate missing bit to complete current open segment
  318. rest := size - len(s)
  319. if need < rest {
  320. rest = need
  321. }
  322. s = append(s, b[:rest]...)
  323. need -= rest
  324. // read full segments and the last possibly partial segment
  325. for need > 0 && i < count-1 {
  326. // push all finished chunks we read
  327. h.writeSegment(i, s, h.depth)
  328. need -= size
  329. if need < 0 {
  330. size += need
  331. }
  332. s = b[rest : rest+size]
  333. rest += size
  334. i++
  335. }
  336. h.segment = s
  337. h.cur = i
  338. // otherwise, we can assume len(s) == 0, so all buffer is read and chunk is not yet full
  339. return l, nil
  340. }
  341. // Hasher implements the io.ReaderFrom interface
  342. // ReadFrom reads from io.Reader and appends to the data to hash using Write
  343. // it reads so that chunk to hash is maximum length or reader reaches EOF
  344. // caller must Reset the hasher prior to call
  345. func (h *Hasher) ReadFrom(r io.Reader) (m int64, err error) {
  346. bufsize := h.size*h.count - h.size*h.cur - len(h.segment)
  347. buf := make([]byte, bufsize)
  348. var read int
  349. for {
  350. var n int
  351. n, err = r.Read(buf)
  352. read += n
  353. if err == io.EOF || read == len(buf) {
  354. hash := h.Sum(buf[:n])
  355. if read == len(buf) {
  356. err = NewEOC(hash)
  357. }
  358. break
  359. }
  360. if err != nil {
  361. break
  362. }
  363. n, err = h.Write(buf[:n])
  364. if err != nil {
  365. break
  366. }
  367. }
  368. return int64(read), err
  369. }
  370. // Reset needs to be called before writing to the hasher
  371. func (h *Hasher) Reset() {
  372. h.getTree()
  373. h.blockLength = nil
  374. }
  375. // Hasher implements the SwarmHash interface
  376. // ResetWithLength needs to be called before writing to the hasher
  377. // the argument is supposed to be the byte slice binary representation of
  378. // the length of the data subsumed under the hash
  379. func (h *Hasher) ResetWithLength(l []byte) {
  380. h.Reset()
  381. h.blockLength = l
  382. }
  383. // Release gives back the Tree to the pool whereby it unlocks
  384. // it resets tree, segment and index
  385. func (h *Hasher) releaseTree() {
  386. if h.bmt != nil {
  387. n := h.bmt.leaves[h.cur]
  388. for ; n != nil; n = n.parent {
  389. n.unbalanced = false
  390. if n.parent != nil {
  391. n.root = false
  392. }
  393. }
  394. h.pool.Release(h.bmt)
  395. h.bmt = nil
  396. }
  397. h.cur = 0
  398. h.segment = nil
  399. }
  400. func (h *Hasher) writeSegment(i int, s []byte, d int) {
  401. hash := h.pool.hasher()
  402. n := h.bmt.leaves[i]
  403. if len(s) > h.size && n.parent != nil {
  404. go func() {
  405. hash.Reset()
  406. hash.Write(s)
  407. s = hash.Sum(nil)
  408. if n.root {
  409. h.result <- s
  410. return
  411. }
  412. h.run(n.parent, hash, d, n.index, s)
  413. }()
  414. return
  415. }
  416. go h.run(n, hash, d, i*2, s)
  417. }
  418. func (h *Hasher) run(n *Node, hash hash.Hash, d int, i int, s []byte) {
  419. isLeft := i%2 == 0
  420. for {
  421. if isLeft {
  422. n.left = s
  423. } else {
  424. n.right = s
  425. }
  426. if !n.unbalanced && n.toggle() {
  427. return
  428. }
  429. if !n.unbalanced || !isLeft || i == 0 && d == 0 {
  430. hash.Reset()
  431. hash.Write(n.left)
  432. hash.Write(n.right)
  433. s = hash.Sum(nil)
  434. } else {
  435. s = append(n.left, n.right...)
  436. }
  437. h.hash = s
  438. if n.root {
  439. h.result <- s
  440. return
  441. }
  442. isLeft = n.isLeft
  443. n = n.parent
  444. i++
  445. }
  446. }
  447. // getTree obtains a BMT resource by reserving one from the pool
  448. func (h *Hasher) getTree() *Tree {
  449. if h.bmt != nil {
  450. return h.bmt
  451. }
  452. t := h.pool.Reserve()
  453. h.bmt = t
  454. return t
  455. }
  456. // atomic bool toggle implementing a concurrent reusable 2-state object
  457. // atomic addint with %2 implements atomic bool toggle
  458. // it returns true if the toggler just put it in the active/waiting state
  459. func (n *Node) toggle() bool {
  460. return atomic.AddInt32(&n.state, 1)%2 == 1
  461. }
  462. func hashstr(b []byte) string {
  463. end := len(b)
  464. if end > 4 {
  465. end = 4
  466. }
  467. return fmt.Sprintf("%x", b[:end])
  468. }
  469. func depth(n int) (d int) {
  470. for l := (n - 1) / 2; l > 0; l /= 2 {
  471. d++
  472. }
  473. return d
  474. }
  475. // finalise is following the zigzags on the tree belonging
  476. // to the final datasegment
  477. func (h *Hasher) finalise(n *Node, i int) (d int) {
  478. isLeft := i%2 == 0
  479. for {
  480. // when the final segment's path is going via left segments
  481. // the incoming data is pushed to the parent upon pulling the left
  482. // we do not need toggle the state since this condition is
  483. // detectable
  484. n.unbalanced = isLeft
  485. n.right = nil
  486. if n.initial {
  487. n.root = true
  488. return d
  489. }
  490. isLeft = n.isLeft
  491. n = n.parent
  492. d++
  493. }
  494. }
  495. // EOC (end of chunk) implements the error interface
  496. type EOC struct {
  497. Hash []byte // read the hash of the chunk off the error
  498. }
  499. // Error returns the error string
  500. func (e *EOC) Error() string {
  501. return fmt.Sprintf("hasher limit reached, chunk hash: %x", e.Hash)
  502. }
  503. // NewEOC creates new end of chunk error with the hash
  504. func NewEOC(hash []byte) *EOC {
  505. return &EOC{hash}
  506. }