algorithm.c 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992
  1. //go:build exclude_me
  2. /*
  3. * algorithm.c
  4. * Copyright (C) 2023 Kovid Goyal <kovid at kovidgoyal.net>
  5. *
  6. * Distributed under terms of the GPL3 license.
  7. */
  8. #include "data-types.h"
  9. #include "binary.h"
  10. #include <math.h>
  11. #include <xxhash.h>
  12. static PyObject *RsyncError = NULL;
  13. static const size_t default_block_size = 6 * 1024;
  14. static const size_t signature_block_size = 20;
  15. void log_error(const char *fmt, ...) { va_list args; va_start(args, fmt); vfprintf(stderr, fmt, args); va_end(args); }
  16. // hashers {{{
  17. typedef void*(*new_hash_t)(void);
  18. typedef void(*delete_hash_t)(void*);
  19. typedef bool(*reset_hash_t)(void*);
  20. typedef bool(*update_hash_t)(void*, const void *input, size_t length);
  21. typedef void(*digest_hash_t)(const void*, void *output);
  22. typedef uint64_t(*digest_hash64_t)(const void*);
  23. typedef uint64_t(*oneshot_hash64_t)(const void*, size_t);
  24. typedef struct hasher_t {
  25. size_t hash_size, block_size;
  26. void *state;
  27. new_hash_t new;
  28. delete_hash_t delete;
  29. reset_hash_t reset;
  30. update_hash_t update;
  31. digest_hash_t digest;
  32. digest_hash64_t digest64;
  33. oneshot_hash64_t oneshot64;
  34. } hasher_t;
  35. static void xxh64_delete(void* s) { XXH3_freeState(s); }
  36. static bool xxh64_reset(void* s) { return XXH3_64bits_reset(s) == XXH_OK; }
  37. static void* xxh64_create(void) { void *ans = XXH3_createState(); if (ans != NULL) xxh64_reset(ans); return ans; }
  38. static bool xxh64_update(void* s, const void *input, size_t length) { return XXH3_64bits_update(s, input, length) == XXH_OK; }
  39. static uint64_t xxh64_digest64(const void* s) { return XXH3_64bits_digest(s); }
  40. static uint64_t xxh64_oneshot64(const void* s, size_t len) { return XXH3_64bits(s, len); }
  41. static void xxh64_digest(const void* s, void *output) {
  42. XXH64_hash_t ans = XXH3_64bits_digest(s);
  43. XXH64_canonical_t c;
  44. XXH64_canonicalFromHash(&c, ans);
  45. memcpy(output, c.digest, sizeof(c.digest));
  46. }
  47. static hasher_t
  48. xxh64_hasher(void) {
  49. hasher_t ans = {
  50. .hash_size=sizeof(XXH64_hash_t), .block_size = 64,
  51. .new=xxh64_create, .delete=xxh64_delete, .reset=xxh64_reset, .update=xxh64_update, .digest=xxh64_digest,
  52. .digest64=xxh64_digest64, .oneshot64=xxh64_oneshot64
  53. };
  54. return ans;
  55. }
  56. static bool xxh128_reset(void* s) { return XXH3_128bits_reset(s) == XXH_OK; }
  57. static void* xxh128_create(void) { void *ans = XXH3_createState(); if (ans != NULL) xxh128_reset(ans); return ans; }
  58. static bool xxh128_update(void* s, const void *input, size_t length) { return XXH3_128bits_update(s, input, length) == XXH_OK; }
  59. static void xxh128_digest(const void* s, void *output) {
  60. XXH128_hash_t ans = XXH3_128bits_digest(s);
  61. XXH128_canonical_t c;
  62. XXH128_canonicalFromHash(&c, ans);
  63. memcpy(output, c.digest, sizeof(c.digest));
  64. }
  65. static hasher_t
  66. xxh128_hasher(void) {
  67. hasher_t ans = {
  68. .hash_size=sizeof(XXH128_hash_t), .block_size = 64,
  69. .new=xxh128_create, .delete=xxh64_delete, .reset=xxh128_reset, .update=xxh128_update, .digest=xxh128_digest,
  70. };
  71. return ans;
  72. }
  73. typedef hasher_t(*hasher_constructor_t)(void);
  74. // }}}
  75. typedef struct Rsync {
  76. size_t block_size;
  77. hasher_constructor_t hasher_constructor, checksummer_constructor;
  78. hasher_t hasher, checksummer;
  79. size_t buffer_cap, buffer_sz;
  80. } Rsync;
  81. static void
  82. free_rsync(Rsync* r) {
  83. if (r->hasher.state) { r->hasher.delete(r->hasher.state); r->hasher.state = NULL; }
  84. if (r->checksummer.state) { r->checksummer.delete(r->checksummer.state); r->checksummer.state = NULL; }
  85. }
  86. static const char*
  87. init_rsync(Rsync *ans, size_t block_size, int strong_hash_type, int checksum_type) {
  88. memset(ans, 0, sizeof(*ans));
  89. ans->block_size = block_size;
  90. if (strong_hash_type == 0) ans->hasher_constructor = xxh64_hasher;
  91. if (checksum_type == 0) ans->checksummer_constructor = xxh128_hasher;
  92. if (ans->hasher_constructor == NULL) { free_rsync(ans); return "Unknown strong hash type"; }
  93. if (ans->checksummer_constructor == NULL) { free_rsync(ans); return "Unknown checksum type"; }
  94. ans->hasher = ans->hasher_constructor();
  95. ans->checksummer = ans->checksummer_constructor();
  96. ans->hasher.state = ans->hasher.new();
  97. if (ans->hasher.state == NULL) { free_rsync(ans); return "Out of memory"; }
  98. ans->checksummer.state = ans->checksummer.new();
  99. if (ans->checksummer.state == NULL) { free_rsync(ans); return "Out of memory"; }
  100. return NULL;
  101. }
  102. typedef struct rolling_checksum {
  103. uint32_t alpha, beta, val, l, first_byte_of_previous_window;
  104. } rolling_checksum;
  105. static const uint32_t _M = (1 << 16);
  106. static uint32_t
  107. rolling_checksum_full(rolling_checksum *self, uint8_t *data, uint32_t len) {
  108. uint32_t alpha = 0, beta = 0;
  109. self->l = len;
  110. for (uint32_t i = 0; i < len; i++) {
  111. alpha += data[i];
  112. beta += (self->l - i) * data[i];
  113. }
  114. self->first_byte_of_previous_window = data[0];
  115. self->alpha = alpha % _M;
  116. self->beta = beta % _M;
  117. self->val = self->alpha + _M*self->beta;
  118. return self->val;
  119. }
  120. inline static void
  121. rolling_checksum_add_one_byte(rolling_checksum *self, uint8_t first_byte, uint8_t last_byte) {
  122. self->alpha = (self->alpha - self->first_byte_of_previous_window + last_byte) % _M;
  123. self->beta = (self->beta - (self->l)*self->first_byte_of_previous_window + self->alpha) % _M;
  124. self->val = self->alpha + _M*self->beta;
  125. self->first_byte_of_previous_window = first_byte;
  126. }
  127. // Python interface {{{
  128. typedef struct buffer {
  129. uint8_t *data;
  130. size_t len, cap;
  131. } buffer;
  132. static bool
  133. ensure_space(buffer *b, size_t amt) {
  134. const size_t len = b->len;
  135. if (amt > 0 && b->cap < len + amt) {
  136. size_t newcap = MAX(b->cap * 2, len + (amt * 2));
  137. b->data = realloc(b->data, newcap);
  138. if (b->data == NULL) { PyErr_NoMemory(); return false; }
  139. b->cap = newcap;
  140. }
  141. return true;
  142. }
  143. static bool
  144. write_to_buffer(buffer *b, void *data, size_t len) {
  145. if (!ensure_space(b, len)) return false;
  146. memcpy(b->data + b->len, data, len);
  147. b->len += len;
  148. return true;
  149. }
  150. static void
  151. shift_left(buffer *b, size_t amt) {
  152. if (amt > b->len) amt = b->len;
  153. if (amt > 0) {
  154. b->len -= amt;
  155. memmove(b->data, b->data + amt, b->len);
  156. }
  157. }
  158. // Patcher {{{
  159. typedef struct {
  160. PyObject_HEAD
  161. rolling_checksum rc;
  162. uint64_t signature_idx;
  163. size_t total_data_in_delta;
  164. Rsync rsync;
  165. buffer buf, block_buf;
  166. PyObject *block_buf_view;
  167. bool checksum_done;
  168. } Patcher;
  169. static int
  170. Patcher_init(PyObject *s, PyObject *args, PyObject *kwds) {
  171. Patcher *self = (Patcher*)s;
  172. static char *kwlist[] = {"expected_input_size", NULL};
  173. unsigned long long expected_input_size = 0;
  174. if (!PyArg_ParseTupleAndKeywords(args, kwds, "|K", kwlist, &expected_input_size)) return -1;
  175. self->rsync.block_size = default_block_size;
  176. if (expected_input_size > 0) {
  177. self->rsync.block_size = (size_t)round(sqrt((double)expected_input_size));
  178. }
  179. const char *err = init_rsync(&self->rsync, self->rsync.block_size, 0, 0);
  180. if (err != NULL) { PyErr_SetString(RsyncError, err); return -1; }
  181. self->block_buf.cap = self->rsync.block_size;
  182. self->block_buf.data = malloc(self->rsync.block_size);
  183. if (self->block_buf.data == NULL) { PyErr_NoMemory(); return -1; }
  184. if (!(self->block_buf_view = PyMemoryView_FromMemory((char*)self->block_buf.data, self->rsync.block_size, PyBUF_WRITE))) return -1;
  185. return 0;
  186. }
  187. static void
  188. Patcher_dealloc(PyObject *self) {
  189. Patcher *p = (Patcher*)self;
  190. if (p->buf.data) free(p->buf.data);
  191. Py_CLEAR(p->block_buf_view);
  192. if (p->block_buf.data) free(p->block_buf.data);
  193. free_rsync(&p->rsync);
  194. Py_TYPE(self)->tp_free(self);
  195. }
  196. static PyObject*
  197. signature_header(Patcher *self, PyObject *a2) {
  198. RAII_PY_BUFFER(dest);
  199. if (PyObject_GetBuffer(a2, &dest, PyBUF_WRITEABLE) == -1) return NULL;
  200. static const ssize_t header_size = 12;
  201. if (dest.len < header_size) {
  202. PyErr_SetString(RsyncError, "Output buffer is too small");
  203. }
  204. uint8_t *o = dest.buf;
  205. le16enc(o, 0); // version
  206. le16enc(o + 2, 0); // checksum type
  207. le16enc(o + 4, 0); // strong hash type
  208. le16enc(o + 6, 0); // weak hash type
  209. le32enc(o + 8, self->rsync.block_size); // block size
  210. return PyLong_FromSsize_t(header_size);
  211. }
  212. static PyObject*
  213. sign_block(Patcher *self, PyObject *args) {
  214. PyObject *a1, *a2;
  215. if (!PyArg_ParseTuple(args, "OO", &a1, &a2)) return NULL;
  216. RAII_PY_BUFFER(src); RAII_PY_BUFFER(dest);
  217. if (PyObject_GetBuffer(a1, &src, PyBUF_SIMPLE) == -1) return NULL;
  218. if (PyObject_GetBuffer(a2, &dest, PyBUF_WRITEABLE) == -1) return NULL;
  219. if (dest.len < (ssize_t)signature_block_size) {
  220. PyErr_SetString(RsyncError, "Output buffer is too small");
  221. }
  222. self->rsync.hasher.reset(self->rsync.hasher.state);
  223. if (!self->rsync.hasher.update(self->rsync.hasher.state, src.buf, src.len)) { PyErr_SetString(PyExc_ValueError, "String hashing failed"); return NULL; }
  224. uint64_t strong_hash = self->rsync.hasher.oneshot64(src.buf, src.len);
  225. uint32_t weak_hash = rolling_checksum_full(&self->rc, src.buf, src.len);
  226. uint8_t *o = dest.buf;
  227. le64enc(o, self->signature_idx++);
  228. le32enc(o + 8, weak_hash);
  229. le64enc(o + 12, strong_hash);
  230. return PyLong_FromSize_t(signature_block_size);
  231. }
  232. typedef enum { OpBlock, OpData, OpHash, OpBlockRange } OpType;
  233. typedef struct Operation {
  234. OpType type;
  235. uint64_t block_index, block_index_end;
  236. struct { uint8_t *buf; size_t len; } data;
  237. } Operation;
  238. static size_t
  239. unserialize_op(uint8_t *data, size_t len, Operation *op) {
  240. size_t consumed = 0;
  241. switch ((OpType)(data[0])) {
  242. case OpBlock:
  243. consumed = 9;
  244. if (len < consumed) return 0;
  245. op->block_index = le64dec(data + 1);
  246. break;
  247. case OpBlockRange:
  248. consumed = 13;
  249. if (len < consumed) return 0;
  250. op->block_index = le64dec(data + 1);
  251. op->block_index_end = op->block_index + le32dec(data + 9);
  252. break;
  253. case OpHash:
  254. consumed = 3;
  255. if (len < consumed) return 0;
  256. op->data.len = le16dec(data + 1);
  257. if (len < consumed + op->data.len) return 0;
  258. op->data.buf = data + 3;
  259. consumed += op->data.len;
  260. break;
  261. case OpData:
  262. consumed = 5;
  263. if (len < consumed) return 0;
  264. op->data.len = le32dec(data + 1);
  265. if (len < consumed + op->data.len) return 0;
  266. op->data.buf = data + 5;
  267. consumed += op->data.len;
  268. break;
  269. }
  270. if (consumed) op->type = data[0];
  271. return consumed;
  272. }
  273. static bool
  274. write_block(Patcher *self, uint64_t block_index, PyObject *read, PyObject *write) {
  275. RAII_PyObject(pos, PyLong_FromUnsignedLongLong((unsigned long long)(self->rsync.block_size * block_index)));
  276. if (!pos) return false;
  277. RAII_PyObject(ret, PyObject_CallFunctionObjArgs(read, pos, self->block_buf_view, NULL));
  278. if (ret == NULL) return false;
  279. if (!PyLong_Check(ret)) { PyErr_SetString(PyExc_TypeError, "read callback function did not return an integer"); return false; }
  280. size_t n = PyLong_AsSize_t(ret);
  281. self->rsync.checksummer.update(self->rsync.checksummer.state, self->block_buf.data, n);
  282. RAII_PyObject(view, PyMemoryView_FromMemory((char*)self->block_buf.data, n, PyBUF_READ));
  283. if (!view) return false;
  284. RAII_PyObject(wret, PyObject_CallFunctionObjArgs(write, view, NULL));
  285. if (wret == NULL) return false;
  286. return true;
  287. }
  288. static void
  289. bytes_as_hex(const uint8_t *bytes, const size_t len, char *ans) {
  290. static const char * hex = "0123456789abcdef";
  291. char *pout = ans; const uint8_t *pin = bytes;
  292. for (; pin < bytes + len; pin++) {
  293. *pout++ = hex[(*pin>>4) & 0xF];
  294. *pout++ = hex[ *pin & 0xF];
  295. }
  296. *pout++ = 0;
  297. }
  298. static bool
  299. apply_op(Patcher *self, Operation op, PyObject *read, PyObject *write) {
  300. switch (op.type) {
  301. case OpBlock:
  302. return write_block(self, op.block_index, read, write);
  303. case OpBlockRange:
  304. for (size_t i = op.block_index; i <= op.block_index_end; i++) {
  305. if (!write_block(self, i, read, write)) return false;
  306. }
  307. return true;
  308. case OpData: {
  309. self->total_data_in_delta += op.data.len;
  310. self->rsync.checksummer.update(self->rsync.checksummer.state, op.data.buf, op.data.len);
  311. RAII_PyObject(view, PyMemoryView_FromMemory((char*)op.data.buf, op.data.len, PyBUF_READ));
  312. if (!view) return false;
  313. RAII_PyObject(wret, PyObject_CallFunctionObjArgs(write, view, NULL));
  314. if (!wret) return false;
  315. } return true;
  316. case OpHash: {
  317. uint8_t actual[64];
  318. if (op.data.len != self->rsync.checksummer.hash_size) { PyErr_SetString(RsyncError, "checksum digest not the correct size"); return false; }
  319. self->rsync.checksummer.digest(self->rsync.checksummer.state, actual);
  320. if (memcmp(actual, op.data.buf, self->rsync.checksummer.hash_size) != 0) {
  321. char hexdigest[129];
  322. bytes_as_hex(actual, self->rsync.checksummer.hash_size, hexdigest);
  323. RAII_PyObject(h1, PyUnicode_FromStringAndSize(hexdigest, 2*self->rsync.checksummer.hash_size));
  324. bytes_as_hex(op.data.buf, op.data.len, hexdigest);
  325. RAII_PyObject(h2, PyUnicode_FromStringAndSize(hexdigest, 2*self->rsync.checksummer.hash_size));
  326. PyErr_Format(RsyncError, "Failed to verify overall file checksum actual: %S != expected: %S, this usually happens because one of the involved files was altered while the operation was in progress.", h1, h2);
  327. return false;
  328. }
  329. self->checksum_done = true;
  330. } return true;
  331. }
  332. PyErr_SetString(RsyncError, "Unknown operation type");
  333. return false;
  334. }
  335. static PyObject*
  336. apply_delta_data(Patcher *self, PyObject *args) {
  337. PyObject *read, *write;
  338. RAII_PY_BUFFER(data);
  339. if (!PyArg_ParseTuple(args, "y*OO", &data, &read, &write)) return NULL;
  340. if (!write_to_buffer(&self->buf, data.buf, data.len)) return NULL;
  341. size_t pos = 0;
  342. Operation op = {0};
  343. while (pos < self->buf.len) {
  344. size_t consumed = unserialize_op(self->buf.data + pos, self->buf.len - pos, &op);
  345. if (!consumed) { break; }
  346. pos += consumed;
  347. if (!apply_op(self, op, read, write)) break;
  348. }
  349. shift_left(&self->buf, pos);
  350. if (PyErr_Occurred()) return NULL;
  351. Py_RETURN_NONE;
  352. }
  353. static PyObject*
  354. finish_delta_data(Patcher *self, PyObject *args UNUSED) {
  355. if (self->buf.len > 0) { PyErr_Format(RsyncError, "%zu bytes of unused delta data", self->buf.len); return NULL; }
  356. if (!self->checksum_done) { PyErr_SetString(RsyncError, "The checksum was not received at the end of the delta data"); return NULL; }
  357. Py_RETURN_NONE;
  358. }
  359. static PyObject*
  360. Patcher_block_size(Patcher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->rsync.block_size); }
  361. static PyObject*
  362. Patcher_total_data_in_delta(Patcher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->total_data_in_delta); }
  363. PyGetSetDef Patcher_getsets[] = {
  364. {"block_size", (getter)Patcher_block_size, NULL, NULL, NULL},
  365. {"total_data_in_delta", (getter)Patcher_total_data_in_delta, NULL, NULL, NULL},
  366. {NULL}
  367. };
  368. static PyMethodDef Patcher_methods[] = {
  369. METHODB(sign_block, METH_VARARGS),
  370. METHODB(signature_header, METH_O),
  371. METHODB(apply_delta_data, METH_VARARGS),
  372. METHODB(finish_delta_data, METH_NOARGS),
  373. {NULL} /* Sentinel */
  374. };
  375. PyTypeObject Patcher_Type = {
  376. PyVarObject_HEAD_INIT(NULL, 0)
  377. .tp_name = "rsync.Patcher",
  378. .tp_basicsize = sizeof(Patcher),
  379. .tp_dealloc = Patcher_dealloc,
  380. .tp_flags = Py_TPFLAGS_DEFAULT,
  381. .tp_doc = "Patcher",
  382. .tp_methods = Patcher_methods,
  383. .tp_new = PyType_GenericNew,
  384. .tp_init = Patcher_init,
  385. .tp_getset = Patcher_getsets,
  386. };
  387. // }}} Patcher
  388. // Differ {{{
  389. typedef struct Signature { uint64_t index, strong_hash; } Signature;
  390. typedef struct SignatureVal {
  391. Signature sig, *weak_hash_collisions;
  392. size_t len, cap;
  393. } SignatureVal;
  394. #define NAME SignatureMap
  395. #define KEY_TY int
  396. #define VAL_TY SignatureVal
  397. static void free_signature_val(SignatureVal x) { free(x.weak_hash_collisions); }
  398. #define VAL_DTOR_FN free_signature_val
  399. #include "kitty-verstable.h"
  400. typedef struct Differ {
  401. PyObject_HEAD
  402. rolling_checksum rc;
  403. uint64_t signature_idx;
  404. Rsync rsync;
  405. bool signature_header_parsed;
  406. buffer buf;
  407. SignatureMap signature_map;
  408. PyObject *read, *write;
  409. bool written, finished;
  410. struct { size_t pos, sz; } window, data;
  411. Operation pending_op; bool has_pending;
  412. uint8_t checksum[32];
  413. } Differ;
  414. static int
  415. Differ_init(PyObject *s, PyObject *args, PyObject *kwds) {
  416. Differ *self = (Differ*)s;
  417. static char *kwlist[] = {NULL};
  418. if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) return -1;
  419. const char *err = init_rsync(&self->rsync, default_block_size, 0, 0);
  420. if (err != NULL) { PyErr_SetString(RsyncError, err); return -1; }
  421. vt_init(&self->signature_map);
  422. return 0;
  423. }
  424. static void
  425. Differ_dealloc(PyObject *self) {
  426. Differ *p = (Differ*)self;
  427. if (p->buf.data) free(p->buf.data);
  428. free_rsync(&p->rsync);
  429. vt_cleanup(&p->signature_map);
  430. Py_TYPE(self)->tp_free(self);
  431. }
  432. static void
  433. parse_signature_header(Differ *self) {
  434. if (self->buf.len < 12) return;
  435. uint8_t *p = self->buf.data;
  436. uint32_t x;
  437. if ((x = le16dec(p)) != 0) {
  438. PyErr_Format(RsyncError, "Invalid version in signature header: %u", x); return;
  439. } p += 2;
  440. if ((x = le16dec(p)) != 0) {
  441. PyErr_Format(RsyncError, "Invalid checksum type in signature header: %u", x); return;
  442. } p += 2;
  443. if ((x = le16dec(p)) != 0) {
  444. PyErr_Format(RsyncError, "Invalid strong hash type in signature header: %u", x); return;
  445. } p += 2;
  446. if ((x = le16dec(p)) != 0) {
  447. PyErr_Format(RsyncError, "Invalid weak hash type in signature header: %u", x); return;
  448. } p += 2;
  449. const char *err = init_rsync(&self->rsync, le32dec(p), 0, 0);
  450. if (err != NULL) { PyErr_SetString(RsyncError, err); return; }
  451. p += 4;
  452. shift_left(&self->buf, p - self->buf.data);
  453. self->signature_header_parsed = true;
  454. }
  455. static bool
  456. add_collision(SignatureVal *sm, Signature s) {
  457. if (sm->cap < sm->len + 1) {
  458. size_t new_cap = MAX(sm->cap * 2, 8u);
  459. sm->weak_hash_collisions = realloc(sm->weak_hash_collisions, new_cap * sizeof(sm->weak_hash_collisions[0]));
  460. if (!sm->weak_hash_collisions) { PyErr_NoMemory(); return false; }
  461. sm->cap = new_cap;
  462. }
  463. sm->weak_hash_collisions[sm->len++] = s;
  464. return true;
  465. }
  466. static size_t
  467. parse_signature_block(Differ *self, uint8_t *data, size_t len) {
  468. if (len < 20) return 0;
  469. int weak_hash = le32dec(data + 8);
  470. SignatureMap_itr i = vt_get(&self->signature_map, weak_hash);
  471. if (vt_is_end(i)) {
  472. SignatureVal s = {0};
  473. s.sig.index = le64dec(data);
  474. s.sig.strong_hash = le64dec(data+12);
  475. vt_insert(&self->signature_map, weak_hash, s);
  476. } else {
  477. if (!add_collision(&i.data->val, (Signature){.index=le64dec(data), .strong_hash=le64dec(data+12)})) return 0;
  478. }
  479. return 20;
  480. }
  481. static PyObject*
  482. add_signature_data(Differ *self, PyObject *args) {
  483. RAII_PY_BUFFER(data);
  484. if (!PyArg_ParseTuple(args, "y*", &data)) return NULL;
  485. if (!write_to_buffer(&self->buf, data.buf, data.len)) return NULL;
  486. if (!self->signature_header_parsed) {
  487. parse_signature_header(self);
  488. if (PyErr_Occurred()) return NULL;
  489. if (!self->signature_header_parsed) { Py_RETURN_NONE; }
  490. }
  491. size_t pos = 0;
  492. while (pos < self->buf.len) {
  493. size_t consumed = parse_signature_block(self, self->buf.data + pos, self->buf.len - pos);
  494. if (!consumed) { break; }
  495. pos += consumed;
  496. }
  497. shift_left(&self->buf, pos);
  498. if (PyErr_Occurred()) return NULL;
  499. Py_RETURN_NONE;
  500. }
  501. static PyObject*
  502. finish_signature_data(Differ *self, PyObject *args UNUSED) {
  503. if (self->buf.len > 0) { PyErr_Format(RsyncError, "%zu bytes of unused signature data", self->buf.len); return NULL; }
  504. self->buf.len = 0;
  505. self->buf.cap = 8 * self->rsync.block_size;
  506. self->buf.data = realloc(self->buf.data, self->buf.cap);
  507. if (!self->buf.data) return PyErr_NoMemory();
  508. Py_RETURN_NONE;
  509. }
  510. static bool
  511. send_op(Differ *self, Operation *op) {
  512. uint8_t metadata[32];
  513. size_t len = 0;
  514. metadata[0] = op->type;
  515. switch (op->type) {
  516. case OpBlock:
  517. le64enc(metadata + 1, op->block_index);
  518. len = 9;
  519. break;
  520. case OpBlockRange:
  521. le64enc(metadata + 1, op->block_index);
  522. le32enc(metadata + 9, op->block_index_end - op->block_index);
  523. len = 13;
  524. break;
  525. case OpHash:
  526. le16enc(metadata + 1, op->data.len);
  527. memcpy(metadata + 3, op->data.buf, op->data.len);
  528. len = 3 + op->data.len;
  529. break;
  530. case OpData:
  531. le32enc(metadata + 1, op->data.len);
  532. len = 5;
  533. break;
  534. }
  535. RAII_PyObject(mv, PyMemoryView_FromMemory((char*)metadata, len, PyBUF_READ));
  536. RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->write, mv, NULL));
  537. if (ret == NULL) return false;
  538. if (op->type == OpData) {
  539. RAII_PyObject(mv, PyMemoryView_FromMemory((char*)op->data.buf, op->data.len, PyBUF_READ));
  540. RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->write, mv, NULL));
  541. if (ret == NULL) return false;
  542. }
  543. self->written = true;
  544. return true;
  545. }
  546. static bool
  547. send_pending(Differ *self) {
  548. bool ret = true;
  549. if (self->has_pending) {
  550. ret = send_op(self, &self->pending_op);
  551. self->has_pending = false;
  552. }
  553. return ret;
  554. }
  555. static bool
  556. send_data(Differ *self) {
  557. if (self->data.sz > 0) {
  558. if (!send_pending(self)) return false;
  559. Operation op = {.type=OpData};
  560. op.data.buf = self->buf.data + self->data.pos;
  561. op.data.len = self->data.sz;
  562. self->data.pos += self->data.sz;
  563. self->data.sz = 0;
  564. return send_op(self, &op);
  565. }
  566. return true;
  567. }
  568. static bool
  569. ensure_idx_valid(Differ *self, size_t idx) {
  570. if (idx < self->buf.len) return true;
  571. if (idx >= self->buf.cap) {
  572. // need to wrap the buffer, so send off any data present behind the window
  573. if (!send_data(self)) return false;
  574. // copy the window and any data present after it to the start of the buffer
  575. size_t distance_from_window_pos = idx - self->window.pos;
  576. size_t amt_to_copy = self->buf.len - self->window.pos;
  577. memmove(self->buf.data, self->buf.data + self->window.pos, amt_to_copy);
  578. self->buf.len = amt_to_copy;
  579. self->window.pos = 0;
  580. self->data.pos = 0;
  581. return ensure_idx_valid(self, distance_from_window_pos);
  582. }
  583. RAII_PyObject(mv, PyMemoryView_FromMemory((char*)self->buf.data + self->buf.len, self->buf.cap - self->buf.len, PyBUF_WRITE));
  584. if (!mv) return false;
  585. RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->read, mv, NULL));
  586. if (!ret) return false;
  587. if (!PyLong_Check(ret)) { PyErr_SetString(PyExc_TypeError, "read callback did not return an integer"); return false; }
  588. size_t n = PyLong_AsSize_t(ret);
  589. self->rsync.checksummer.update(self->rsync.checksummer.state, self->buf.data + self->buf.len, n);
  590. self->buf.len += n;
  591. return self->buf.len > idx;
  592. }
  593. static bool
  594. find_strong_hash(const SignatureVal *sm, uint64_t q, uint64_t *block_index) {
  595. if (sm->sig.strong_hash == q) { *block_index = sm->sig.index; return true; }
  596. for (size_t i = 0; i < sm->len; i++) {
  597. if (sm->weak_hash_collisions[i].strong_hash == q) { *block_index = sm->weak_hash_collisions[i].index; return true; }
  598. }
  599. return false;
  600. }
  601. static bool
  602. enqueue(Differ *self, Operation op) {
  603. switch (op.type) {
  604. case OpBlock:
  605. if (self->has_pending) {
  606. switch (self->pending_op.type) {
  607. case OpBlock:
  608. if (self->pending_op.block_index+1 == op.block_index) {
  609. self->pending_op.type = OpBlockRange;
  610. self->pending_op.block_index_end = op.block_index;
  611. return true;
  612. }
  613. break;
  614. case OpBlockRange:
  615. if (self->pending_op.block_index_end+1 == op.block_index) {
  616. self->pending_op.block_index_end = op.block_index;
  617. return true;
  618. }
  619. case OpHash: case OpData: break;
  620. }
  621. if (!send_pending(self)) return false;
  622. }
  623. self->pending_op = op;
  624. self->has_pending = true;
  625. return true;
  626. case OpHash:
  627. if (!send_pending(self)) return false;
  628. return send_op(self, &op);
  629. case OpBlockRange: case OpData:
  630. PyErr_SetString(RsyncError, "enqueue() must never be called with anything other than OpHash and OpBlock");
  631. return false;
  632. }
  633. return false;
  634. }
  635. static bool
  636. finish_up(Differ *self) {
  637. if (!send_data(self)) return false;
  638. self->data.pos = self->window.pos;
  639. self->data.sz = self->buf.len - self->window.pos;
  640. if (!send_data(self)) return false;
  641. self->rsync.checksummer.digest(self->rsync.checksummer.state, self->checksum);
  642. Operation op = {.type=OpHash};
  643. op.data.buf = self->checksum; op.data.len = self->rsync.checksummer.hash_size;
  644. if (!enqueue(self, op)) return false;
  645. self->finished = true;
  646. return true;
  647. }
  648. static bool
  649. read_next(Differ *self) {
  650. if (self->window.sz > 0) {
  651. if (!ensure_idx_valid(self, self->window.pos + self->window.sz)) {
  652. if (PyErr_Occurred()) return false;
  653. return finish_up(self);
  654. }
  655. self->window.pos++;
  656. self->data.sz++;
  657. rolling_checksum_add_one_byte(&self->rc, self->buf.data[self->window.pos], self->buf.data[self->window.pos + self->window.sz - 1]);
  658. } else {
  659. if (!ensure_idx_valid(self, self->window.pos + self->rsync.block_size - 1)) {
  660. if (PyErr_Occurred()) return false;
  661. return finish_up(self);
  662. }
  663. self->window.sz = self->rsync.block_size;
  664. rolling_checksum_full(&self->rc, self->buf.data + self->window.pos, self->window.sz);
  665. }
  666. int weak_hash = self->rc.val;
  667. uint64_t block_index = 0;
  668. SignatureMap_itr i = vt_get(&self->signature_map, weak_hash);
  669. if (!vt_is_end(i) && find_strong_hash(&i.data->val, self->rsync.hasher.oneshot64(self->buf.data + self->window.pos, self->window.sz), &block_index)) {
  670. if (!send_data(self)) return false;
  671. if (!enqueue(self, (Operation){.type=OpBlock, .block_index=block_index})) return false;
  672. self->window.pos += self->window.sz;
  673. self->data.pos = self->window.pos;
  674. self->window.sz = 0;
  675. }
  676. return true;
  677. }
  678. static PyObject*
  679. next_op(Differ *self, PyObject *args) {
  680. if (!PyArg_ParseTuple(args, "OO", &self->read, &self->write)) return NULL;
  681. self->written = false;
  682. while (!self->written && !self->finished) {
  683. if (!read_next(self)) break;
  684. }
  685. if (self->finished && !PyErr_Occurred()) {
  686. send_pending(self);
  687. }
  688. self->read = NULL; self->write = NULL;
  689. if (PyErr_Occurred()) return NULL;
  690. if (self->finished) { Py_RETURN_FALSE; }
  691. Py_RETURN_TRUE;
  692. }
  693. static PyMethodDef Differ_methods[] = {
  694. METHODB(add_signature_data, METH_VARARGS),
  695. METHODB(finish_signature_data, METH_NOARGS),
  696. METHODB(next_op, METH_VARARGS),
  697. {NULL} /* Sentinel */
  698. };
  699. PyTypeObject Differ_Type = {
  700. PyVarObject_HEAD_INIT(NULL, 0)
  701. .tp_name = "rsync.Differ",
  702. .tp_basicsize = sizeof(Differ),
  703. .tp_dealloc = Differ_dealloc,
  704. .tp_flags = Py_TPFLAGS_DEFAULT,
  705. .tp_doc = "Differ",
  706. .tp_methods = Differ_methods,
  707. .tp_new = PyType_GenericNew,
  708. .tp_init = Differ_init,
  709. };
  710. // }}} Differ
  711. // Hasher {{{
  712. typedef struct {
  713. PyObject_HEAD
  714. hasher_t h;
  715. const char *name;
  716. } Hasher;
  717. static int
  718. Hasher_init(PyObject *s, PyObject *args, PyObject *kwds) {
  719. Hasher *self = (Hasher*)s;
  720. static char *kwlist[] = {"which", "data", NULL};
  721. const char *which = "xxh3-64";
  722. RAII_PY_BUFFER(data);
  723. if (!PyArg_ParseTupleAndKeywords(args, kwds, "|sy*", kwlist, &which, &data)) return -1;
  724. if (strcmp(which, "xxh3-64") == 0) {
  725. self->h = xxh64_hasher();
  726. self->name = "xxh3-64";
  727. } else if (strcmp(which, "xxh3-128") == 0) {
  728. self->h = xxh128_hasher();
  729. self->name = "xxh3-128";
  730. } else {
  731. PyErr_Format(PyExc_KeyError, "Unknown hash type: %s", which);
  732. return -1;
  733. }
  734. self->h.state = self->h.new();
  735. if (self->h.state == NULL) { PyErr_NoMemory(); return -1; }
  736. if (data.buf && data.len > 0) {
  737. self->h.update(self->h.state, data.buf, data.len);
  738. }
  739. return 0;
  740. }
  741. static void
  742. Hasher_dealloc(PyObject *self) {
  743. Hasher *h = (Hasher*)self;
  744. if (h->h.state) { h->h.delete(h->h.state); h->h.state = NULL; }
  745. Py_TYPE(self)->tp_free(self);
  746. }
  747. static PyObject*
  748. reset(Hasher *self, PyObject *args UNUSED) {
  749. if (!self->h.reset(self->h.state)) return PyErr_NoMemory();
  750. Py_RETURN_NONE;
  751. }
  752. static PyObject*
  753. update(Hasher *self, PyObject *o) {
  754. RAII_PY_BUFFER(data);
  755. if (PyObject_GetBuffer(o, &data, PyBUF_SIMPLE) == -1) return NULL;
  756. if (data.buf && data.len > 0) {
  757. self->h.update(self->h.state, data.buf, data.len);
  758. }
  759. Py_RETURN_NONE;
  760. }
  761. static PyObject*
  762. digest(Hasher *self, PyObject *args UNUSED) {
  763. PyObject *ans = PyBytes_FromStringAndSize(NULL, self->h.hash_size);
  764. if (ans) self->h.digest(self->h.state, PyBytes_AS_STRING(ans));
  765. return ans;
  766. }
  767. static PyObject*
  768. digest64(Hasher *self, PyObject *args UNUSED) {
  769. if (self->h.digest64 == NULL) { PyErr_SetString(PyExc_TypeError, "Does not support 64-bit digests"); return NULL; }
  770. unsigned long long a = self->h.digest64(self->h.state);
  771. return PyLong_FromUnsignedLongLong(a);
  772. }
  773. static PyObject*
  774. hexdigest(Hasher *self, PyObject *args UNUSED) {
  775. uint8_t digest[64]; char hexdigest[128];
  776. self->h.digest(self->h.state, digest);
  777. bytes_as_hex(digest, self->h.hash_size, hexdigest);
  778. return PyUnicode_FromStringAndSize(hexdigest, self->h.hash_size * 2);
  779. }
  780. static PyObject*
  781. Hasher_digest_size(Hasher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->h.hash_size); }
  782. static PyObject*
  783. Hasher_block_size(Hasher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->h.block_size); }
  784. static PyObject*
  785. Hasher_name(Hasher* self, void* closure UNUSED) { return PyUnicode_FromString(self->name); }
  786. static PyMethodDef Hasher_methods[] = {
  787. METHODB(update, METH_O),
  788. METHODB(digest, METH_NOARGS),
  789. METHODB(digest64, METH_NOARGS),
  790. METHODB(hexdigest, METH_NOARGS),
  791. METHODB(reset, METH_NOARGS),
  792. {NULL} /* Sentinel */
  793. };
  794. PyGetSetDef Hasher_getsets[] = {
  795. {"digest_size", (getter)Hasher_digest_size, NULL, NULL, NULL},
  796. {"block_size", (getter)Hasher_block_size, NULL, NULL, NULL},
  797. {"name", (getter)Hasher_name, NULL, NULL, NULL},
  798. {NULL}
  799. };
  800. PyTypeObject Hasher_Type = {
  801. PyVarObject_HEAD_INIT(NULL, 0)
  802. .tp_name = "rsync.Hasher",
  803. .tp_basicsize = sizeof(Hasher),
  804. .tp_dealloc = Hasher_dealloc,
  805. .tp_flags = Py_TPFLAGS_DEFAULT,
  806. .tp_doc = "Hasher",
  807. .tp_methods = Hasher_methods,
  808. .tp_new = PyType_GenericNew,
  809. .tp_init = Hasher_init,
  810. .tp_getset = Hasher_getsets,
  811. };
  812. // }}} end Hasher
  813. static bool
  814. call_ftc_callback(PyObject *callback, char *src, Py_ssize_t key_start, Py_ssize_t key_length, Py_ssize_t val_start, Py_ssize_t val_length) {
  815. while(src[key_start] == ';' && key_length > 0 ) { key_start++; key_length--; }
  816. RAII_PyObject(k, PyMemoryView_FromMemory(src + key_start, key_length, PyBUF_READ));
  817. if (!k) return false;
  818. RAII_PyObject(v, PyMemoryView_FromMemory(src + val_start, val_length, PyBUF_READ));
  819. if (!v) return false;
  820. RAII_PyObject(ret, PyObject_CallFunctionObjArgs(callback, k, v, NULL));
  821. return ret != NULL;
  822. }
  823. static PyObject*
  824. parse_ftc(PyObject *self UNUSED, PyObject *args) {
  825. RAII_PY_BUFFER(buf);
  826. PyObject *callback;
  827. size_t i = 0, key_start = 0, key_length = 0, val_start = 0, val_length = 0;
  828. if (!PyArg_ParseTuple(args, "s*O", &buf, &callback)) return NULL;
  829. char *src = buf.buf;
  830. size_t sz = buf.len;
  831. if (!PyCallable_Check(callback)) { PyErr_SetString(PyExc_TypeError, "callback must be callable"); return NULL; }
  832. for (i = 0; i < sz; i++) {
  833. char ch = src[i];
  834. if (key_length == 0) {
  835. if (ch == '=') {
  836. key_length = i - key_start;
  837. val_start = i + 1;
  838. }
  839. } else {
  840. if (ch == ';') {
  841. val_length = i - val_start;
  842. if (!call_ftc_callback(callback, src, key_start, key_length, val_start, val_length)) return NULL;
  843. key_length = 0; key_start = i + 1; val_start = 0;
  844. }
  845. }
  846. }
  847. if (key_length && val_start) {
  848. val_length = sz - val_start;
  849. if (!call_ftc_callback(callback, src, key_start, key_length, val_start, val_length)) return NULL;
  850. }
  851. Py_RETURN_NONE;
  852. }
  853. static PyObject*
  854. pyxxh128_hash(PyObject *self UNUSED, PyObject *b) {
  855. RAII_PY_BUFFER(data);
  856. if (PyObject_GetBuffer(b, &data, PyBUF_SIMPLE) == -1) return NULL;
  857. XXH128_canonical_t c;
  858. XXH128_canonicalFromHash(&c, XXH3_128bits(data.buf, data.len));
  859. return PyBytes_FromStringAndSize((char*)c.digest, sizeof(c.digest));
  860. }
  861. static PyObject*
  862. pyxxh128_hash_with_seed(PyObject *self UNUSED, PyObject *args) {
  863. RAII_PY_BUFFER(data);
  864. unsigned long long seed;
  865. if (!PyArg_ParseTuple(args, "y*K", &data, &seed)) return NULL;
  866. XXH128_canonical_t c;
  867. XXH128_canonicalFromHash(&c, XXH3_128bits_withSeed(data.buf, data.len, seed));
  868. return PyBytes_FromStringAndSize((char*)c.digest, sizeof(c.digest));
  869. }
  870. static PyMethodDef module_methods[] = {
  871. {"parse_ftc", parse_ftc, METH_VARARGS, ""},
  872. {"xxh128_hash", pyxxh128_hash, METH_O, ""},
  873. {"xxh128_hash_with_seed", pyxxh128_hash_with_seed, METH_VARARGS, ""},
  874. {NULL, NULL, 0, NULL} /* Sentinel */
  875. };
  876. static int
  877. exec_module(PyObject *m) {
  878. RsyncError = PyErr_NewException("rsync.RsyncError", NULL, NULL);
  879. if (RsyncError == NULL) return -1;
  880. PyModule_AddObject(m, "RsyncError", RsyncError);
  881. #define T(which) if (PyType_Ready(& which##_Type) < 0) return -1; Py_INCREF(&which##_Type);\
  882. if (PyModule_AddObject(m, #which, (PyObject *) &which##_Type) < 0) return -1;
  883. T(Hasher); T(Patcher); T(Differ);
  884. #undef T
  885. return 0;
  886. }
  887. IGNORE_PEDANTIC_WARNINGS
  888. static PyModuleDef_Slot slots[] = { {Py_mod_exec, (void*)exec_module}, {0, NULL} };
  889. END_IGNORE_PEDANTIC_WARNINGS
  890. static struct PyModuleDef module = {
  891. .m_base = PyModuleDef_HEAD_INIT,
  892. .m_name = "rsync", /* name of module */
  893. .m_doc = NULL,
  894. .m_slots = slots,
  895. .m_methods = module_methods
  896. };
  897. EXPORTED PyMODINIT_FUNC
  898. PyInit_rsync(void) {
  899. return PyModuleDef_Init(&module);
  900. }
  901. // }}}