12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992 |
- //go:build exclude_me
- /*
- * algorithm.c
- * Copyright (C) 2023 Kovid Goyal <kovid at kovidgoyal.net>
- *
- * Distributed under terms of the GPL3 license.
- */
- #include "data-types.h"
- #include "binary.h"
- #include <math.h>
- #include <xxhash.h>
- static PyObject *RsyncError = NULL;
- static const size_t default_block_size = 6 * 1024;
- static const size_t signature_block_size = 20;
/* Write a printf-style formatted message to stderr. */
void
log_error(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
}
- // hashers {{{
/* Function-pointer signatures making up a pluggable hash backend. */
typedef void *(*new_hash_t)(void);                                        /* allocate a state */
typedef void (*delete_hash_t)(void *);                                    /* free a state */
typedef bool (*reset_hash_t)(void *);                                     /* restart hashing */
typedef bool (*update_hash_t)(void *, const void *input, size_t length);  /* feed bytes */
typedef void (*digest_hash_t)(const void *, void *output);                /* write digest bytes */
typedef uint64_t (*digest_hash64_t)(const void *);                        /* digest as an integer */
typedef uint64_t (*oneshot_hash64_t)(const void *, size_t);               /* hash a buffer directly */

/* A hash backend: its sizes, its (optional) live state and its operations.
 * digest64/oneshot64 may be NULL for backends that cannot produce a 64-bit value. */
typedef struct hasher_t {
    size_t hash_size;   /* digest length in bytes */
    size_t block_size;  /* exposed as Hasher.block_size */
    void *state;        /* created by new(), released by delete() */
    new_hash_t new;
    delete_hash_t delete;
    reset_hash_t reset;
    update_hash_t update;
    digest_hash_t digest;
    digest_hash64_t digest64;
    oneshot_hash64_t oneshot64;
} hasher_t;
- static void xxh64_delete(void* s) { XXH3_freeState(s); }
- static bool xxh64_reset(void* s) { return XXH3_64bits_reset(s) == XXH_OK; }
- static void* xxh64_create(void) { void *ans = XXH3_createState(); if (ans != NULL) xxh64_reset(ans); return ans; }
- static bool xxh64_update(void* s, const void *input, size_t length) { return XXH3_64bits_update(s, input, length) == XXH_OK; }
- static uint64_t xxh64_digest64(const void* s) { return XXH3_64bits_digest(s); }
- static uint64_t xxh64_oneshot64(const void* s, size_t len) { return XXH3_64bits(s, len); }
- static void xxh64_digest(const void* s, void *output) {
- XXH64_hash_t ans = XXH3_64bits_digest(s);
- XXH64_canonical_t c;
- XXH64_canonicalFromHash(&c, ans);
- memcpy(output, c.digest, sizeof(c.digest));
- }
- static hasher_t
- xxh64_hasher(void) {
- hasher_t ans = {
- .hash_size=sizeof(XXH64_hash_t), .block_size = 64,
- .new=xxh64_create, .delete=xxh64_delete, .reset=xxh64_reset, .update=xxh64_update, .digest=xxh64_digest,
- .digest64=xxh64_digest64, .oneshot64=xxh64_oneshot64
- };
- return ans;
- }
- static bool xxh128_reset(void* s) { return XXH3_128bits_reset(s) == XXH_OK; }
- static void* xxh128_create(void) { void *ans = XXH3_createState(); if (ans != NULL) xxh128_reset(ans); return ans; }
- static bool xxh128_update(void* s, const void *input, size_t length) { return XXH3_128bits_update(s, input, length) == XXH_OK; }
- static void xxh128_digest(const void* s, void *output) {
- XXH128_hash_t ans = XXH3_128bits_digest(s);
- XXH128_canonical_t c;
- XXH128_canonicalFromHash(&c, ans);
- memcpy(output, c.digest, sizeof(c.digest));
- }
- static hasher_t
- xxh128_hasher(void) {
- hasher_t ans = {
- .hash_size=sizeof(XXH128_hash_t), .block_size = 64,
- .new=xxh128_create, .delete=xxh64_delete, .reset=xxh128_reset, .update=xxh128_update, .digest=xxh128_digest,
- };
- return ans;
- }
- typedef hasher_t(*hasher_constructor_t)(void);
- // }}}
- typedef struct Rsync {
- size_t block_size;
- hasher_constructor_t hasher_constructor, checksummer_constructor;
- hasher_t hasher, checksummer;
- size_t buffer_cap, buffer_sz;
- } Rsync;
- static void
- free_rsync(Rsync* r) {
- if (r->hasher.state) { r->hasher.delete(r->hasher.state); r->hasher.state = NULL; }
- if (r->checksummer.state) { r->checksummer.delete(r->checksummer.state); r->checksummer.state = NULL; }
- }
- static const char*
- init_rsync(Rsync *ans, size_t block_size, int strong_hash_type, int checksum_type) {
- memset(ans, 0, sizeof(*ans));
- ans->block_size = block_size;
- if (strong_hash_type == 0) ans->hasher_constructor = xxh64_hasher;
- if (checksum_type == 0) ans->checksummer_constructor = xxh128_hasher;
- if (ans->hasher_constructor == NULL) { free_rsync(ans); return "Unknown strong hash type"; }
- if (ans->checksummer_constructor == NULL) { free_rsync(ans); return "Unknown checksum type"; }
- ans->hasher = ans->hasher_constructor();
- ans->checksummer = ans->checksummer_constructor();
- ans->hasher.state = ans->hasher.new();
- if (ans->hasher.state == NULL) { free_rsync(ans); return "Out of memory"; }
- ans->checksummer.state = ans->checksummer.new();
- if (ans->checksummer.state == NULL) { free_rsync(ans); return "Out of memory"; }
- return NULL;
- }
/* State of the rsync-style weak rolling checksum over a window of l bytes. */
typedef struct rolling_checksum {
    uint32_t alpha, beta, val, l, first_byte_of_previous_window;
} rolling_checksum;

/* Both halves of the checksum are kept modulo 2^16. */
static const uint32_t _M = (1 << 16);

/* Compute the checksum of data[0..len) from scratch and store it in self.
 *
 * alpha is the byte sum and beta the position-weighted byte sum, each modulo
 * 2^16; the returned value packs them as alpha + 2^16 * beta.
 * A zero-length window is allowed: data is not dereferenced and the result is 0. */
static uint32_t
rolling_checksum_full(rolling_checksum *self, const uint8_t *data, uint32_t len) {
    uint32_t alpha = 0, beta = 0;
    self->l = len;
    for (uint32_t i = 0; i < len; i++) {
        alpha += data[i];
        beta += (self->l - i) * data[i];
    }
    /* data[0] does not exist for an empty window */
    self->first_byte_of_previous_window = len > 0 ? data[0] : 0;
    self->alpha = alpha % _M;
    self->beta = beta % _M;
    self->val = self->alpha + _M*self->beta;
    return self->val;
}

/* Slide the window forward by one byte.
 *
 * first_byte is the first byte of the NEW window (remembered for the next
 * slide) and last_byte is the byte that just entered at the end. The byte that
 * leaves is the one remembered from the previous call. */
inline static void
rolling_checksum_add_one_byte(rolling_checksum *self, uint8_t first_byte, uint8_t last_byte) {
    self->alpha = (self->alpha - self->first_byte_of_previous_window + last_byte) % _M;
    self->beta = (self->beta - (self->l)*self->first_byte_of_previous_window + self->alpha) % _M;
    self->val = self->alpha + _M*self->beta;
    self->first_byte_of_previous_window = first_byte;
}
- // Python interface {{{
/* Growable byte buffer: len bytes are in use out of cap allocated. */
typedef struct buffer {
    uint8_t *data;
    size_t len;
    size_t cap;
} buffer;
- static bool
- ensure_space(buffer *b, size_t amt) {
- const size_t len = b->len;
- if (amt > 0 && b->cap < len + amt) {
- size_t newcap = MAX(b->cap * 2, len + (amt * 2));
- b->data = realloc(b->data, newcap);
- if (b->data == NULL) { PyErr_NoMemory(); return false; }
- b->cap = newcap;
- }
- return true;
- }
- static bool
- write_to_buffer(buffer *b, void *data, size_t len) {
- if (!ensure_space(b, len)) return false;
- memcpy(b->data + b->len, data, len);
- b->len += len;
- return true;
- }
- static void
- shift_left(buffer *b, size_t amt) {
- if (amt > b->len) amt = b->len;
- if (amt > 0) {
- b->len -= amt;
- memmove(b->data, b->data + amt, b->len);
- }
- }
- // Patcher {{{
- typedef struct {
- PyObject_HEAD
- rolling_checksum rc;
- uint64_t signature_idx;
- size_t total_data_in_delta;
- Rsync rsync;
- buffer buf, block_buf;
- PyObject *block_buf_view;
- bool checksum_done;
- } Patcher;
- static int
- Patcher_init(PyObject *s, PyObject *args, PyObject *kwds) {
- Patcher *self = (Patcher*)s;
- static char *kwlist[] = {"expected_input_size", NULL};
- unsigned long long expected_input_size = 0;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|K", kwlist, &expected_input_size)) return -1;
- self->rsync.block_size = default_block_size;
- if (expected_input_size > 0) {
- self->rsync.block_size = (size_t)round(sqrt((double)expected_input_size));
- }
- const char *err = init_rsync(&self->rsync, self->rsync.block_size, 0, 0);
- if (err != NULL) { PyErr_SetString(RsyncError, err); return -1; }
- self->block_buf.cap = self->rsync.block_size;
- self->block_buf.data = malloc(self->rsync.block_size);
- if (self->block_buf.data == NULL) { PyErr_NoMemory(); return -1; }
- if (!(self->block_buf_view = PyMemoryView_FromMemory((char*)self->block_buf.data, self->rsync.block_size, PyBUF_WRITE))) return -1;
- return 0;
- }
- static void
- Patcher_dealloc(PyObject *self) {
- Patcher *p = (Patcher*)self;
- if (p->buf.data) free(p->buf.data);
- Py_CLEAR(p->block_buf_view);
- if (p->block_buf.data) free(p->block_buf.data);
- free_rsync(&p->rsync);
- Py_TYPE(self)->tp_free(self);
- }
- static PyObject*
- signature_header(Patcher *self, PyObject *a2) {
- RAII_PY_BUFFER(dest);
- if (PyObject_GetBuffer(a2, &dest, PyBUF_WRITEABLE) == -1) return NULL;
- static const ssize_t header_size = 12;
- if (dest.len < header_size) {
- PyErr_SetString(RsyncError, "Output buffer is too small");
- }
- uint8_t *o = dest.buf;
- le16enc(o, 0); // version
- le16enc(o + 2, 0); // checksum type
- le16enc(o + 4, 0); // strong hash type
- le16enc(o + 6, 0); // weak hash type
- le32enc(o + 8, self->rsync.block_size); // block size
- return PyLong_FromSsize_t(header_size);
- }
- static PyObject*
- sign_block(Patcher *self, PyObject *args) {
- PyObject *a1, *a2;
- if (!PyArg_ParseTuple(args, "OO", &a1, &a2)) return NULL;
- RAII_PY_BUFFER(src); RAII_PY_BUFFER(dest);
- if (PyObject_GetBuffer(a1, &src, PyBUF_SIMPLE) == -1) return NULL;
- if (PyObject_GetBuffer(a2, &dest, PyBUF_WRITEABLE) == -1) return NULL;
- if (dest.len < (ssize_t)signature_block_size) {
- PyErr_SetString(RsyncError, "Output buffer is too small");
- }
- self->rsync.hasher.reset(self->rsync.hasher.state);
- if (!self->rsync.hasher.update(self->rsync.hasher.state, src.buf, src.len)) { PyErr_SetString(PyExc_ValueError, "String hashing failed"); return NULL; }
- uint64_t strong_hash = self->rsync.hasher.oneshot64(src.buf, src.len);
- uint32_t weak_hash = rolling_checksum_full(&self->rc, src.buf, src.len);
- uint8_t *o = dest.buf;
- le64enc(o, self->signature_idx++);
- le32enc(o + 8, weak_hash);
- le64enc(o + 12, strong_hash);
- return PyLong_FromSize_t(signature_block_size);
- }
/* Delta operation kinds. The numeric value is the first byte of a serialized
 * operation, so the order must not change. */
typedef enum {
    OpBlock,       /* copy one block of the existing file */
    OpData,        /* literal bytes carried in the delta */
    OpHash,        /* checksum of the whole resulting file */
    OpBlockRange   /* copy blocks block_index..block_index_end inclusive */
} OpType;

/* One parsed or pending delta operation. */
typedef struct Operation {
    OpType type;
    uint64_t block_index;
    uint64_t block_index_end;
    struct { uint8_t *buf; size_t len; } data;  /* payload for OpData / OpHash */
} Operation;
- static size_t
- unserialize_op(uint8_t *data, size_t len, Operation *op) {
- size_t consumed = 0;
- switch ((OpType)(data[0])) {
- case OpBlock:
- consumed = 9;
- if (len < consumed) return 0;
- op->block_index = le64dec(data + 1);
- break;
- case OpBlockRange:
- consumed = 13;
- if (len < consumed) return 0;
- op->block_index = le64dec(data + 1);
- op->block_index_end = op->block_index + le32dec(data + 9);
- break;
- case OpHash:
- consumed = 3;
- if (len < consumed) return 0;
- op->data.len = le16dec(data + 1);
- if (len < consumed + op->data.len) return 0;
- op->data.buf = data + 3;
- consumed += op->data.len;
- break;
- case OpData:
- consumed = 5;
- if (len < consumed) return 0;
- op->data.len = le32dec(data + 1);
- if (len < consumed + op->data.len) return 0;
- op->data.buf = data + 5;
- consumed += op->data.len;
- break;
- }
- if (consumed) op->type = data[0];
- return consumed;
- }
- static bool
- write_block(Patcher *self, uint64_t block_index, PyObject *read, PyObject *write) {
- RAII_PyObject(pos, PyLong_FromUnsignedLongLong((unsigned long long)(self->rsync.block_size * block_index)));
- if (!pos) return false;
- RAII_PyObject(ret, PyObject_CallFunctionObjArgs(read, pos, self->block_buf_view, NULL));
- if (ret == NULL) return false;
- if (!PyLong_Check(ret)) { PyErr_SetString(PyExc_TypeError, "read callback function did not return an integer"); return false; }
- size_t n = PyLong_AsSize_t(ret);
- self->rsync.checksummer.update(self->rsync.checksummer.state, self->block_buf.data, n);
- RAII_PyObject(view, PyMemoryView_FromMemory((char*)self->block_buf.data, n, PyBUF_READ));
- if (!view) return false;
- RAII_PyObject(wret, PyObject_CallFunctionObjArgs(write, view, NULL));
- if (wret == NULL) return false;
- return true;
- }
/* Write the lowercase hexadecimal form of bytes[0..len) into ans, followed by
 * a NUL terminator. ans must have room for 2*len + 1 characters. */
static void
bytes_as_hex(const uint8_t *bytes, const size_t len, char *ans) {
    static const char digits[] = "0123456789abcdef";
    for (size_t i = 0; i < len; i++) {
        const uint8_t b = bytes[i];
        ans[2 * i] = digits[(b >> 4) & 0xF];
        ans[2 * i + 1] = digits[b & 0xF];
    }
    ans[2 * len] = 0;
}
- static bool
- apply_op(Patcher *self, Operation op, PyObject *read, PyObject *write) {
- switch (op.type) {
- case OpBlock:
- return write_block(self, op.block_index, read, write);
- case OpBlockRange:
- for (size_t i = op.block_index; i <= op.block_index_end; i++) {
- if (!write_block(self, i, read, write)) return false;
- }
- return true;
- case OpData: {
- self->total_data_in_delta += op.data.len;
- self->rsync.checksummer.update(self->rsync.checksummer.state, op.data.buf, op.data.len);
- RAII_PyObject(view, PyMemoryView_FromMemory((char*)op.data.buf, op.data.len, PyBUF_READ));
- if (!view) return false;
- RAII_PyObject(wret, PyObject_CallFunctionObjArgs(write, view, NULL));
- if (!wret) return false;
- } return true;
- case OpHash: {
- uint8_t actual[64];
- if (op.data.len != self->rsync.checksummer.hash_size) { PyErr_SetString(RsyncError, "checksum digest not the correct size"); return false; }
- self->rsync.checksummer.digest(self->rsync.checksummer.state, actual);
- if (memcmp(actual, op.data.buf, self->rsync.checksummer.hash_size) != 0) {
- char hexdigest[129];
- bytes_as_hex(actual, self->rsync.checksummer.hash_size, hexdigest);
- RAII_PyObject(h1, PyUnicode_FromStringAndSize(hexdigest, 2*self->rsync.checksummer.hash_size));
- bytes_as_hex(op.data.buf, op.data.len, hexdigest);
- RAII_PyObject(h2, PyUnicode_FromStringAndSize(hexdigest, 2*self->rsync.checksummer.hash_size));
- PyErr_Format(RsyncError, "Failed to verify overall file checksum actual: %S != expected: %S, this usually happens because one of the involved files was altered while the operation was in progress.", h1, h2);
- return false;
- }
- self->checksum_done = true;
- } return true;
- }
- PyErr_SetString(RsyncError, "Unknown operation type");
- return false;
- }
- static PyObject*
- apply_delta_data(Patcher *self, PyObject *args) {
- PyObject *read, *write;
- RAII_PY_BUFFER(data);
- if (!PyArg_ParseTuple(args, "y*OO", &data, &read, &write)) return NULL;
- if (!write_to_buffer(&self->buf, data.buf, data.len)) return NULL;
- size_t pos = 0;
- Operation op = {0};
- while (pos < self->buf.len) {
- size_t consumed = unserialize_op(self->buf.data + pos, self->buf.len - pos, &op);
- if (!consumed) { break; }
- pos += consumed;
- if (!apply_op(self, op, read, write)) break;
- }
- shift_left(&self->buf, pos);
- if (PyErr_Occurred()) return NULL;
- Py_RETURN_NONE;
- }
- static PyObject*
- finish_delta_data(Patcher *self, PyObject *args UNUSED) {
- if (self->buf.len > 0) { PyErr_Format(RsyncError, "%zu bytes of unused delta data", self->buf.len); return NULL; }
- if (!self->checksum_done) { PyErr_SetString(RsyncError, "The checksum was not received at the end of the delta data"); return NULL; }
- Py_RETURN_NONE;
- }
- static PyObject*
- Patcher_block_size(Patcher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->rsync.block_size); }
- static PyObject*
- Patcher_total_data_in_delta(Patcher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->total_data_in_delta); }
- PyGetSetDef Patcher_getsets[] = {
- {"block_size", (getter)Patcher_block_size, NULL, NULL, NULL},
- {"total_data_in_delta", (getter)Patcher_total_data_in_delta, NULL, NULL, NULL},
- {NULL}
- };
- static PyMethodDef Patcher_methods[] = {
- METHODB(sign_block, METH_VARARGS),
- METHODB(signature_header, METH_O),
- METHODB(apply_delta_data, METH_VARARGS),
- METHODB(finish_delta_data, METH_NOARGS),
- {NULL} /* Sentinel */
- };
- PyTypeObject Patcher_Type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "rsync.Patcher",
- .tp_basicsize = sizeof(Patcher),
- .tp_dealloc = Patcher_dealloc,
- .tp_flags = Py_TPFLAGS_DEFAULT,
- .tp_doc = "Patcher",
- .tp_methods = Patcher_methods,
- .tp_new = PyType_GenericNew,
- .tp_init = Patcher_init,
- .tp_getset = Patcher_getsets,
- };
- // }}} Patcher
- // Differ {{{
- typedef struct Signature { uint64_t index, strong_hash; } Signature;
- typedef struct SignatureVal {
- Signature sig, *weak_hash_collisions;
- size_t len, cap;
- } SignatureVal;
- #define NAME SignatureMap
- #define KEY_TY int
- #define VAL_TY SignatureVal
- static void free_signature_val(SignatureVal x) { free(x.weak_hash_collisions); }
- #define VAL_DTOR_FN free_signature_val
- #include "kitty-verstable.h"
- typedef struct Differ {
- PyObject_HEAD
- rolling_checksum rc;
- uint64_t signature_idx;
- Rsync rsync;
- bool signature_header_parsed;
- buffer buf;
- SignatureMap signature_map;
- PyObject *read, *write;
- bool written, finished;
- struct { size_t pos, sz; } window, data;
- Operation pending_op; bool has_pending;
- uint8_t checksum[32];
- } Differ;
- static int
- Differ_init(PyObject *s, PyObject *args, PyObject *kwds) {
- Differ *self = (Differ*)s;
- static char *kwlist[] = {NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) return -1;
- const char *err = init_rsync(&self->rsync, default_block_size, 0, 0);
- if (err != NULL) { PyErr_SetString(RsyncError, err); return -1; }
- vt_init(&self->signature_map);
- return 0;
- }
- static void
- Differ_dealloc(PyObject *self) {
- Differ *p = (Differ*)self;
- if (p->buf.data) free(p->buf.data);
- free_rsync(&p->rsync);
- vt_cleanup(&p->signature_map);
- Py_TYPE(self)->tp_free(self);
- }
- static void
- parse_signature_header(Differ *self) {
- if (self->buf.len < 12) return;
- uint8_t *p = self->buf.data;
- uint32_t x;
- if ((x = le16dec(p)) != 0) {
- PyErr_Format(RsyncError, "Invalid version in signature header: %u", x); return;
- } p += 2;
- if ((x = le16dec(p)) != 0) {
- PyErr_Format(RsyncError, "Invalid checksum type in signature header: %u", x); return;
- } p += 2;
- if ((x = le16dec(p)) != 0) {
- PyErr_Format(RsyncError, "Invalid strong hash type in signature header: %u", x); return;
- } p += 2;
- if ((x = le16dec(p)) != 0) {
- PyErr_Format(RsyncError, "Invalid weak hash type in signature header: %u", x); return;
- } p += 2;
- const char *err = init_rsync(&self->rsync, le32dec(p), 0, 0);
- if (err != NULL) { PyErr_SetString(RsyncError, err); return; }
- p += 4;
- shift_left(&self->buf, p - self->buf.data);
- self->signature_header_parsed = true;
- }
- static bool
- add_collision(SignatureVal *sm, Signature s) {
- if (sm->cap < sm->len + 1) {
- size_t new_cap = MAX(sm->cap * 2, 8u);
- sm->weak_hash_collisions = realloc(sm->weak_hash_collisions, new_cap * sizeof(sm->weak_hash_collisions[0]));
- if (!sm->weak_hash_collisions) { PyErr_NoMemory(); return false; }
- sm->cap = new_cap;
- }
- sm->weak_hash_collisions[sm->len++] = s;
- return true;
- }
- static size_t
- parse_signature_block(Differ *self, uint8_t *data, size_t len) {
- if (len < 20) return 0;
- int weak_hash = le32dec(data + 8);
- SignatureMap_itr i = vt_get(&self->signature_map, weak_hash);
- if (vt_is_end(i)) {
- SignatureVal s = {0};
- s.sig.index = le64dec(data);
- s.sig.strong_hash = le64dec(data+12);
- vt_insert(&self->signature_map, weak_hash, s);
- } else {
- if (!add_collision(&i.data->val, (Signature){.index=le64dec(data), .strong_hash=le64dec(data+12)})) return 0;
- }
- return 20;
- }
- static PyObject*
- add_signature_data(Differ *self, PyObject *args) {
- RAII_PY_BUFFER(data);
- if (!PyArg_ParseTuple(args, "y*", &data)) return NULL;
- if (!write_to_buffer(&self->buf, data.buf, data.len)) return NULL;
- if (!self->signature_header_parsed) {
- parse_signature_header(self);
- if (PyErr_Occurred()) return NULL;
- if (!self->signature_header_parsed) { Py_RETURN_NONE; }
- }
- size_t pos = 0;
- while (pos < self->buf.len) {
- size_t consumed = parse_signature_block(self, self->buf.data + pos, self->buf.len - pos);
- if (!consumed) { break; }
- pos += consumed;
- }
- shift_left(&self->buf, pos);
- if (PyErr_Occurred()) return NULL;
- Py_RETURN_NONE;
- }
- static PyObject*
- finish_signature_data(Differ *self, PyObject *args UNUSED) {
- if (self->buf.len > 0) { PyErr_Format(RsyncError, "%zu bytes of unused signature data", self->buf.len); return NULL; }
- self->buf.len = 0;
- self->buf.cap = 8 * self->rsync.block_size;
- self->buf.data = realloc(self->buf.data, self->buf.cap);
- if (!self->buf.data) return PyErr_NoMemory();
- Py_RETURN_NONE;
- }
- static bool
- send_op(Differ *self, Operation *op) {
- uint8_t metadata[32];
- size_t len = 0;
- metadata[0] = op->type;
- switch (op->type) {
- case OpBlock:
- le64enc(metadata + 1, op->block_index);
- len = 9;
- break;
- case OpBlockRange:
- le64enc(metadata + 1, op->block_index);
- le32enc(metadata + 9, op->block_index_end - op->block_index);
- len = 13;
- break;
- case OpHash:
- le16enc(metadata + 1, op->data.len);
- memcpy(metadata + 3, op->data.buf, op->data.len);
- len = 3 + op->data.len;
- break;
- case OpData:
- le32enc(metadata + 1, op->data.len);
- len = 5;
- break;
- }
- RAII_PyObject(mv, PyMemoryView_FromMemory((char*)metadata, len, PyBUF_READ));
- RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->write, mv, NULL));
- if (ret == NULL) return false;
- if (op->type == OpData) {
- RAII_PyObject(mv, PyMemoryView_FromMemory((char*)op->data.buf, op->data.len, PyBUF_READ));
- RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->write, mv, NULL));
- if (ret == NULL) return false;
- }
- self->written = true;
- return true;
- }
- static bool
- send_pending(Differ *self) {
- bool ret = true;
- if (self->has_pending) {
- ret = send_op(self, &self->pending_op);
- self->has_pending = false;
- }
- return ret;
- }
- static bool
- send_data(Differ *self) {
- if (self->data.sz > 0) {
- if (!send_pending(self)) return false;
- Operation op = {.type=OpData};
- op.data.buf = self->buf.data + self->data.pos;
- op.data.len = self->data.sz;
- self->data.pos += self->data.sz;
- self->data.sz = 0;
- return send_op(self, &op);
- }
- return true;
- }
- static bool
- ensure_idx_valid(Differ *self, size_t idx) {
- if (idx < self->buf.len) return true;
- if (idx >= self->buf.cap) {
- // need to wrap the buffer, so send off any data present behind the window
- if (!send_data(self)) return false;
- // copy the window and any data present after it to the start of the buffer
- size_t distance_from_window_pos = idx - self->window.pos;
- size_t amt_to_copy = self->buf.len - self->window.pos;
- memmove(self->buf.data, self->buf.data + self->window.pos, amt_to_copy);
- self->buf.len = amt_to_copy;
- self->window.pos = 0;
- self->data.pos = 0;
- return ensure_idx_valid(self, distance_from_window_pos);
- }
- RAII_PyObject(mv, PyMemoryView_FromMemory((char*)self->buf.data + self->buf.len, self->buf.cap - self->buf.len, PyBUF_WRITE));
- if (!mv) return false;
- RAII_PyObject(ret, PyObject_CallFunctionObjArgs(self->read, mv, NULL));
- if (!ret) return false;
- if (!PyLong_Check(ret)) { PyErr_SetString(PyExc_TypeError, "read callback did not return an integer"); return false; }
- size_t n = PyLong_AsSize_t(ret);
- self->rsync.checksummer.update(self->rsync.checksummer.state, self->buf.data + self->buf.len, n);
- self->buf.len += n;
- return self->buf.len > idx;
- }
- static bool
- find_strong_hash(const SignatureVal *sm, uint64_t q, uint64_t *block_index) {
- if (sm->sig.strong_hash == q) { *block_index = sm->sig.index; return true; }
- for (size_t i = 0; i < sm->len; i++) {
- if (sm->weak_hash_collisions[i].strong_hash == q) { *block_index = sm->weak_hash_collisions[i].index; return true; }
- }
- return false;
- }
- static bool
- enqueue(Differ *self, Operation op) {
- switch (op.type) {
- case OpBlock:
- if (self->has_pending) {
- switch (self->pending_op.type) {
- case OpBlock:
- if (self->pending_op.block_index+1 == op.block_index) {
- self->pending_op.type = OpBlockRange;
- self->pending_op.block_index_end = op.block_index;
- return true;
- }
- break;
- case OpBlockRange:
- if (self->pending_op.block_index_end+1 == op.block_index) {
- self->pending_op.block_index_end = op.block_index;
- return true;
- }
- case OpHash: case OpData: break;
- }
- if (!send_pending(self)) return false;
- }
- self->pending_op = op;
- self->has_pending = true;
- return true;
- case OpHash:
- if (!send_pending(self)) return false;
- return send_op(self, &op);
- case OpBlockRange: case OpData:
- PyErr_SetString(RsyncError, "enqueue() must never be called with anything other than OpHash and OpBlock");
- return false;
- }
- return false;
- }
- static bool
- finish_up(Differ *self) {
- if (!send_data(self)) return false;
- self->data.pos = self->window.pos;
- self->data.sz = self->buf.len - self->window.pos;
- if (!send_data(self)) return false;
- self->rsync.checksummer.digest(self->rsync.checksummer.state, self->checksum);
- Operation op = {.type=OpHash};
- op.data.buf = self->checksum; op.data.len = self->rsync.checksummer.hash_size;
- if (!enqueue(self, op)) return false;
- self->finished = true;
- return true;
- }
- static bool
- read_next(Differ *self) {
- if (self->window.sz > 0) {
- if (!ensure_idx_valid(self, self->window.pos + self->window.sz)) {
- if (PyErr_Occurred()) return false;
- return finish_up(self);
- }
- self->window.pos++;
- self->data.sz++;
- rolling_checksum_add_one_byte(&self->rc, self->buf.data[self->window.pos], self->buf.data[self->window.pos + self->window.sz - 1]);
- } else {
- if (!ensure_idx_valid(self, self->window.pos + self->rsync.block_size - 1)) {
- if (PyErr_Occurred()) return false;
- return finish_up(self);
- }
- self->window.sz = self->rsync.block_size;
- rolling_checksum_full(&self->rc, self->buf.data + self->window.pos, self->window.sz);
- }
- int weak_hash = self->rc.val;
- uint64_t block_index = 0;
- SignatureMap_itr i = vt_get(&self->signature_map, weak_hash);
- if (!vt_is_end(i) && find_strong_hash(&i.data->val, self->rsync.hasher.oneshot64(self->buf.data + self->window.pos, self->window.sz), &block_index)) {
- if (!send_data(self)) return false;
- if (!enqueue(self, (Operation){.type=OpBlock, .block_index=block_index})) return false;
- self->window.pos += self->window.sz;
- self->data.pos = self->window.pos;
- self->window.sz = 0;
- }
- return true;
- }
- static PyObject*
- next_op(Differ *self, PyObject *args) {
- if (!PyArg_ParseTuple(args, "OO", &self->read, &self->write)) return NULL;
- self->written = false;
- while (!self->written && !self->finished) {
- if (!read_next(self)) break;
- }
- if (self->finished && !PyErr_Occurred()) {
- send_pending(self);
- }
- self->read = NULL; self->write = NULL;
- if (PyErr_Occurred()) return NULL;
- if (self->finished) { Py_RETURN_FALSE; }
- Py_RETURN_TRUE;
- }
- static PyMethodDef Differ_methods[] = {
- METHODB(add_signature_data, METH_VARARGS),
- METHODB(finish_signature_data, METH_NOARGS),
- METHODB(next_op, METH_VARARGS),
- {NULL} /* Sentinel */
- };
- PyTypeObject Differ_Type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "rsync.Differ",
- .tp_basicsize = sizeof(Differ),
- .tp_dealloc = Differ_dealloc,
- .tp_flags = Py_TPFLAGS_DEFAULT,
- .tp_doc = "Differ",
- .tp_methods = Differ_methods,
- .tp_new = PyType_GenericNew,
- .tp_init = Differ_init,
- };
- // }}} Differ
- // Hasher {{{
- typedef struct {
- PyObject_HEAD
- hasher_t h;
- const char *name;
- } Hasher;
- static int
- Hasher_init(PyObject *s, PyObject *args, PyObject *kwds) {
- Hasher *self = (Hasher*)s;
- static char *kwlist[] = {"which", "data", NULL};
- const char *which = "xxh3-64";
- RAII_PY_BUFFER(data);
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|sy*", kwlist, &which, &data)) return -1;
- if (strcmp(which, "xxh3-64") == 0) {
- self->h = xxh64_hasher();
- self->name = "xxh3-64";
- } else if (strcmp(which, "xxh3-128") == 0) {
- self->h = xxh128_hasher();
- self->name = "xxh3-128";
- } else {
- PyErr_Format(PyExc_KeyError, "Unknown hash type: %s", which);
- return -1;
- }
- self->h.state = self->h.new();
- if (self->h.state == NULL) { PyErr_NoMemory(); return -1; }
- if (data.buf && data.len > 0) {
- self->h.update(self->h.state, data.buf, data.len);
- }
- return 0;
- }
- static void
- Hasher_dealloc(PyObject *self) {
- Hasher *h = (Hasher*)self;
- if (h->h.state) { h->h.delete(h->h.state); h->h.state = NULL; }
- Py_TYPE(self)->tp_free(self);
- }
- static PyObject*
- reset(Hasher *self, PyObject *args UNUSED) {
- if (!self->h.reset(self->h.state)) return PyErr_NoMemory();
- Py_RETURN_NONE;
- }
- static PyObject*
- update(Hasher *self, PyObject *o) {
- RAII_PY_BUFFER(data);
- if (PyObject_GetBuffer(o, &data, PyBUF_SIMPLE) == -1) return NULL;
- if (data.buf && data.len > 0) {
- self->h.update(self->h.state, data.buf, data.len);
- }
- Py_RETURN_NONE;
- }
- static PyObject*
- digest(Hasher *self, PyObject *args UNUSED) {
- PyObject *ans = PyBytes_FromStringAndSize(NULL, self->h.hash_size);
- if (ans) self->h.digest(self->h.state, PyBytes_AS_STRING(ans));
- return ans;
- }
- static PyObject*
- digest64(Hasher *self, PyObject *args UNUSED) {
- if (self->h.digest64 == NULL) { PyErr_SetString(PyExc_TypeError, "Does not support 64-bit digests"); return NULL; }
- unsigned long long a = self->h.digest64(self->h.state);
- return PyLong_FromUnsignedLongLong(a);
- }
- static PyObject*
- hexdigest(Hasher *self, PyObject *args UNUSED) {
- uint8_t digest[64]; char hexdigest[128];
- self->h.digest(self->h.state, digest);
- bytes_as_hex(digest, self->h.hash_size, hexdigest);
- return PyUnicode_FromStringAndSize(hexdigest, self->h.hash_size * 2);
- }
- static PyObject*
- Hasher_digest_size(Hasher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->h.hash_size); }
- static PyObject*
- Hasher_block_size(Hasher* self, void* closure UNUSED) { return PyLong_FromSize_t(self->h.block_size); }
- static PyObject*
- Hasher_name(Hasher* self, void* closure UNUSED) { return PyUnicode_FromString(self->name); }
- static PyMethodDef Hasher_methods[] = {
- METHODB(update, METH_O),
- METHODB(digest, METH_NOARGS),
- METHODB(digest64, METH_NOARGS),
- METHODB(hexdigest, METH_NOARGS),
- METHODB(reset, METH_NOARGS),
- {NULL} /* Sentinel */
- };
- PyGetSetDef Hasher_getsets[] = {
- {"digest_size", (getter)Hasher_digest_size, NULL, NULL, NULL},
- {"block_size", (getter)Hasher_block_size, NULL, NULL, NULL},
- {"name", (getter)Hasher_name, NULL, NULL, NULL},
- {NULL}
- };
- PyTypeObject Hasher_Type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "rsync.Hasher",
- .tp_basicsize = sizeof(Hasher),
- .tp_dealloc = Hasher_dealloc,
- .tp_flags = Py_TPFLAGS_DEFAULT,
- .tp_doc = "Hasher",
- .tp_methods = Hasher_methods,
- .tp_new = PyType_GenericNew,
- .tp_init = Hasher_init,
- .tp_getset = Hasher_getsets,
- };
- // }}} end Hasher
- static bool
- call_ftc_callback(PyObject *callback, char *src, Py_ssize_t key_start, Py_ssize_t key_length, Py_ssize_t val_start, Py_ssize_t val_length) {
- while(src[key_start] == ';' && key_length > 0 ) { key_start++; key_length--; }
- RAII_PyObject(k, PyMemoryView_FromMemory(src + key_start, key_length, PyBUF_READ));
- if (!k) return false;
- RAII_PyObject(v, PyMemoryView_FromMemory(src + val_start, val_length, PyBUF_READ));
- if (!v) return false;
- RAII_PyObject(ret, PyObject_CallFunctionObjArgs(callback, k, v, NULL));
- return ret != NULL;
- }
- static PyObject*
- parse_ftc(PyObject *self UNUSED, PyObject *args) {
- RAII_PY_BUFFER(buf);
- PyObject *callback;
- size_t i = 0, key_start = 0, key_length = 0, val_start = 0, val_length = 0;
- if (!PyArg_ParseTuple(args, "s*O", &buf, &callback)) return NULL;
- char *src = buf.buf;
- size_t sz = buf.len;
- if (!PyCallable_Check(callback)) { PyErr_SetString(PyExc_TypeError, "callback must be callable"); return NULL; }
- for (i = 0; i < sz; i++) {
- char ch = src[i];
- if (key_length == 0) {
- if (ch == '=') {
- key_length = i - key_start;
- val_start = i + 1;
- }
- } else {
- if (ch == ';') {
- val_length = i - val_start;
- if (!call_ftc_callback(callback, src, key_start, key_length, val_start, val_length)) return NULL;
- key_length = 0; key_start = i + 1; val_start = 0;
- }
- }
- }
- if (key_length && val_start) {
- val_length = sz - val_start;
- if (!call_ftc_callback(callback, src, key_start, key_length, val_start, val_length)) return NULL;
- }
- Py_RETURN_NONE;
- }
- static PyObject*
- pyxxh128_hash(PyObject *self UNUSED, PyObject *b) {
- RAII_PY_BUFFER(data);
- if (PyObject_GetBuffer(b, &data, PyBUF_SIMPLE) == -1) return NULL;
- XXH128_canonical_t c;
- XXH128_canonicalFromHash(&c, XXH3_128bits(data.buf, data.len));
- return PyBytes_FromStringAndSize((char*)c.digest, sizeof(c.digest));
- }
- static PyObject*
- pyxxh128_hash_with_seed(PyObject *self UNUSED, PyObject *args) {
- RAII_PY_BUFFER(data);
- unsigned long long seed;
- if (!PyArg_ParseTuple(args, "y*K", &data, &seed)) return NULL;
- XXH128_canonical_t c;
- XXH128_canonicalFromHash(&c, XXH3_128bits_withSeed(data.buf, data.len, seed));
- return PyBytes_FromStringAndSize((char*)c.digest, sizeof(c.digest));
- }
- static PyMethodDef module_methods[] = {
- {"parse_ftc", parse_ftc, METH_VARARGS, ""},
- {"xxh128_hash", pyxxh128_hash, METH_O, ""},
- {"xxh128_hash_with_seed", pyxxh128_hash_with_seed, METH_VARARGS, ""},
- {NULL, NULL, 0, NULL} /* Sentinel */
- };
- static int
- exec_module(PyObject *m) {
- RsyncError = PyErr_NewException("rsync.RsyncError", NULL, NULL);
- if (RsyncError == NULL) return -1;
- PyModule_AddObject(m, "RsyncError", RsyncError);
- #define T(which) if (PyType_Ready(& which##_Type) < 0) return -1; Py_INCREF(&which##_Type);\
- if (PyModule_AddObject(m, #which, (PyObject *) &which##_Type) < 0) return -1;
- T(Hasher); T(Patcher); T(Differ);
- #undef T
- return 0;
- }
- IGNORE_PEDANTIC_WARNINGS
- static PyModuleDef_Slot slots[] = { {Py_mod_exec, (void*)exec_module}, {0, NULL} };
- END_IGNORE_PEDANTIC_WARNINGS
- static struct PyModuleDef module = {
- .m_base = PyModuleDef_HEAD_INIT,
- .m_name = "rsync", /* name of module */
- .m_doc = NULL,
- .m_slots = slots,
- .m_methods = module_methods
- };
- EXPORTED PyMODINIT_FUNC
- PyInit_rsync(void) {
- return PyModuleDef_Init(&module);
- }
- // }}}
|