From da0e1aa69defa7cbc87209966c751918f523f1fb Mon Sep 17 00:00:00 2001 From: Piotr Szarmanski Date: Sat, 5 Aug 2023 22:57:30 +0200 Subject: Encoder refactor, new tests and proper non-file stream handling --- README | 13 ++--- src/backend.lisp | 15 ++++-- src/eris.lisp | 126 +++++++++++++++++++++++++++++++++++----------- tests/backend-tests.lisp | 9 ---- tests/common.lisp | 15 ++++++ tests/decode-tests.lisp | 18 +++---- tests/encode-tests.lisp | 128 ++++++++++++++++++++++++++++++++++++++++++----- 7 files changed, 250 insertions(+), 74 deletions(-) diff --git a/README b/README index b428895..53d4c6d 100644 --- a/README +++ b/README @@ -24,19 +24,13 @@ block-urn-to-reference The eris-encode (INPUT BLOCK-SIZE OUTPUT-FUNCTION &KEY SECRET HASH-OUTPUT) -function can be used to encode a vector or a stream into an ERIS -read-capability. +function can be used to encode a vector, stream or pathname into an ERIS +read-capability. The eris-decode (READ-CAPABILITY FETCH-FUNCTION &KEY (CACHE-CAPACITY 2048)) function can be used to decode an ERIS read-capability. It returns a stream of the class ERIS-DECODE-STREAM: this class implements the Gray streams protocol. -In addition, on POSIX systems, eris-decode-parallel (READ-CAPABILITY -FETCH-FUNCTION OUTPUT-FILE &KEY (CACHE-CAPACITY 4096) (THREADS 4) -(INITIAL-BINDINGS *DEFAULT-SPECIAL-BINDINGS*)) function is available. This -function will attempt to decode an ERIS read-capability in parallel into a file -specified by the OUTPUT-FILE string or pathspec. - See the docstrings of the specific functions for more details. However, you should only use these to write custom backends; otherwise, see below.. @@ -48,8 +42,7 @@ fetch-function, caching details, block-size, etc. and the {en/de}coding functions simply take the backend as an argument. This interface consists of two generic functions: store-data, for encoding data, -and fetch-read-capability, for retrieving the contents of a read-capability -object. 
+and fetch-data, for retrieving the contents from a read-capability object. As an example, a file-based backend called file-backend is provided. It can be used simply by making an instance of the 'file-backend class with a :directory diff --git a/src/backend.lisp b/src/backend.lisp index 8d1405b..d2b81b2 100644 --- a/src/backend.lisp +++ b/src/backend.lisp @@ -27,11 +27,14 @@ "Using the BACKEND, return a stream that decodes the provided READ-CAPABILITY object.")) -(defgeneric store-data (input backend &key secret &allow-other-keys) +(defgeneric store-data (input backend &key secret block-size &allow-other-keys) (:documentation "Using the BACKEND, store the INPUT, which is either a stream or an octet vector. An additional 32-byte octet-vector SECRET can be provided in order to -protect the data from attacks against convergent encryption.")) +protect the data from attacks against convergent encryption. + +BLOCK-SIZE is by default 32kib, except if the input is a file or vector with a +size of at most 16kib, in which case 1kib is used. It should be set either to 1kib or 32kib.")) ;; Default methods @@ -41,13 +44,15 @@ protect the data from attacks against convergent encryption.")) (with-slots (fetch-function) backend (eris-decode read-capability fetch-function))) -(defmethod store-data (input (backend encoding-backend) &key (secret null-secret) &allow-other-keys) +(defmethod store-data (input (backend encoding-backend) &key (secret null-secret) (block-size 32kib) &allow-other-keys) (declare (type octet-vector secret)) (with-slots (output-function) backend (eris-encode input ;; According to ERIS spec recommendation. 
(if (> (etypecase input - (stream (file-length input)) - (vector (length input))) + (pathname (file-size input)) + (file-stream (file-length input)) + (vector (length input)) + (t block-size)) 16384) 32kib 1kib) diff --git a/src/eris.lisp b/src/eris.lisp index 7efdc73..196bcce 100644 --- a/src/eris.lisp +++ b/src/eris.lisp @@ -133,26 +133,76 @@ versioning bytes are not supported by eris-cl." (declare (type string urn)) (base32-to-bytes-unpadded (subseq urn (1+ (position #\: urn :from-end t))))) +;; This macro assumes that there are variables BLOCK, SECRET and OUTPUT-FUNCTION +;; in the lexenv. -(defun pad (input block-size) - (declare (type octet-vector input) - (type integer block-size)) - (let* ((pad-size (- block-size (mod (length input) block-size))) - (padded-input (adjust-array input (+ pad-size (length input)) :initial-element 0))) - (replace padded-input input) - (setf (aref padded-input (length input)) #x80) - padded-input)) - -(defmacro output-block (ref-vector) +(defmacro output-block (rks i) `(let ((rk (encrypt-block block secret))) - (vector-push-extend rk ,ref-vector) + (setf (svref ,rks ,i) rk) (funcall output-function block (subseq rk 0 32)))) + (defmacro output-internal-block (ref-vector nonce) `(let ((rk (encrypt-internal-block block ,nonce))) (vector-push-extend rk ,ref-vector) (funcall output-function block (subseq rk 0 32)))) + +;; These CHUNK- functions are written in order to allow processing files in +;; parallel. + +(defun chunk-array (array block-size output-function secret &key pad) + "Split ARRAY, a (SIMPLE-ARRAY (UNSIGNED-BYTE 8)) whose length is a multiple of BLOCK-SIZE, into +chunks, output them and collect references. Returns a vector of references. + +Pass PAD as T if the output should be padded; the length may then be arbitrary." 
+ (declare (type block-size block-size) + (type octet-vector array)) + (let ((blocks (if pad + (/ (+ (length array) (- block-size (mod (length array) block-size))) block-size) + (/ (length array) block-size)))) + (let ((block (make-octet-vector block-size)) + (rks (make-array blocks :element-type 'octet-vector :initial-element null-secret))) + (loop for i from 0 below (1- blocks) + do (progn + (replace block array :start2 (* block-size i)) + (setf block (output-block rks i)))) + ;; handle last block + (replace block array :start2 (* block-size (1- blocks))) + (when pad + (setf (aref block (mod (length array) block-size)) #x80) + (fill block 0 :start (1+ (mod (length array) block-size)))) + (output-block rks (1- blocks)) + rks))) + + +;; Implementation note: This is CHUNK-ARRAY but copypasted with (LENGTH ARRAY) +;; changed to LENGTH and REPLACE changed to READ-SEQUENCE. It is, however, more +;; memory-efficient than reading a file into an array and then chunking it. + +(defun chunk-stream (stream block-size output-function length secret &key pad) + "Like CHUNK-ARRAY, but with streams. LENGTH indicates the amount of bytes to +read and should be a multiple of BLOCK-SIZE unless PAD is T." 
+ (declare (type block-size block-size) + (type integer length)) + (let ((blocks (if pad + (/ (+ length (- block-size (mod length block-size))) block-size) + (/ length block-size)))) + (let ((block (make-octet-vector block-size)) + ;; initialize with null-secret to please SBCL + (rks (make-array blocks :element-type 'octet-vector :initial-element null-secret))) + (loop for i from 0 below (1- blocks) + do (progn + (read-sequence block stream ) + (setf block (output-block rks i)))) + ;; handle last block + (read-sequence block stream) + (when pad + (setf (aref block (mod length block-size)) #x80) + (fill block 0 :start (1+ (mod length block-size)))) + (output-block rks (1- blocks)) + rks))) + (defgeneric eris-encode (input block-size output-function &key secret) (:documentation "Encode an INPUT into BLOCK-SIZE (32kib or 1kib) blocks, that are output using @@ -161,30 +211,40 @@ encoded block and a 32-byte reference octet vector, and it MUST return a (SIMPLE-ARRAY (UNSIGNED-BYTE 8)) of equal size to the one given, which will be destructively modified. Returns a read-capability object. 
-An optional 32-byte secret can be passed for additional encryption using the -SECRET keyword argument.")) +A SECRET can be provided to use with encryption; otherwise the null secret (* 32 0x0) +is used.")) (defmethod eris-encode ((input vector) block-size output-function &key (secret null-secret)) (declare (type block-size block-size) (type function output-function) (type (octet-vector 32) secret)) + (eris-create-tree + (chunk-array input block-size output-function secret :pad t) + block-size output-function)) + +(defmethod eris-encode ((input pathname) block-size output-function &key (secret null-secret)) + (declare (type block-size block-size) + (type function output-function) + (type (octet-vector 32) secret)) + (with-open-file (f input :element-type 'octet) + (eris-create-tree + (chunk-stream f block-size output-function (file-length f) secret :pad t) + block-size output-function))) - (setf input (pad input block-size)) - - (let ((reference-vector (make-array 16 :adjustable t :fill-pointer 0)) - (block (make-array block-size :element-type 'octet :initial-element 0))) - (declare (type octet-vector block)) - (loop for i = 0 then (incf i) - until (= (length input) (* i block-size)) - do (progn (replace block input :start2 (* i block-size)) - (setf block (output-block reference-vector)) - (fill block 0))) - ;; always bzero the buffer; this is unoptimal (it only needs to be zeroed out to eliminate trailing junk) - ;; TODO: consider removing this entire function and replacing it with an octet stream - (eris-create-tree reference-vector block-size output-function))) +(defmethod eris-encode ((input file-stream) block-size output-function &key (secret null-secret)) + (declare (type block-size block-size) + (type function output-function) + (type (octet-vector 32) secret)) + (eris-create-tree + (chunk-stream input block-size output-function + (- (file-length input) (file-position input)) + secret :pad t) + block-size output-function)) + +;; This is the odd one out 
because it is not possible to determine the length of +;; a non-file stream (modulo broadcast and synonym streams). (defmethod eris-encode ((input stream) block-size output-function &key (secret null-secret)) - "This method does not handle any IO related conditions." (declare (type block-size block-size) (type function output-function) (type (octet-vector 32) secret)) @@ -195,8 +255,10 @@ SECRET keyword argument.")) for i = 0 then (incf i) if (< bytes-read block-size) do (progn (setf (aref block bytes-read) #x80) - (fill block 0 :start (1+ bytes-read))) ;; bzero the buffer here to eliminate trailing junk - do (progn (setf block (output-block reference-vector))) + (fill block 0 :start (1+ bytes-read))) + do (progn (setf block (let ((rk (encrypt-block block secret))) + (vector-push-extend rk reference-vector) + (funcall output-function block (subseq rk 0 32))))) until (< bytes-read block-size)) (eris-create-tree reference-vector block-size output-function))) @@ -231,3 +293,9 @@ SECRET keyword argument.")) (output-internal-block reference-vector-l nonce))) (setf reference-vector reference-vector-l) (setf reference-vector-l (make-array 16 :adjustable t :fill-pointer 0))))) + + + + + + diff --git a/tests/backend-tests.lisp b/tests/backend-tests.lisp index dc411d5..0f60267 100644 --- a/tests/backend-tests.lisp +++ b/tests/backend-tests.lisp @@ -34,15 +34,6 @@ (test-hash-backend (make-octets 16834 :element 5) 32kib) (test-hash-backend (make-octets 96000 :element 5) 32kib)) -(defun make-temporary-dir () - (let* ((tmpdir (uiop:temporary-directory)) - (tmp-tmpdir (make-pathname :directory (serapeum:append1 - (pathname-directory tmpdir) - (ironclad:byte-array-to-hex-string (ironclad:random-data 10))) - :defaults tmpdir))) - (ensure-directories-exist tmp-tmpdir) - tmp-tmpdir)) - (defmacro test-file-backend (array &optional (secret null-secret)) `(let ((tmpdir (make-temporary-dir))) (unwind-protect diff --git a/tests/common.lisp b/tests/common.lisp index 99a85c9..7a9309f 100644 
--- a/tests/common.lisp +++ b/tests/common.lisp @@ -21,3 +21,18 @@ (defmacro make-octet-array-with-loop (loop) `(let ((seq ,loop)) (make-array (length seq) :element-type '(unsigned-byte 8) :initial-contents seq) )) + +(defun make-temporary-dir () + (let* ((tmpdir (uiop:temporary-directory)) + (tmp-tmpdir (make-pathname :directory (serapeum:append1 + (pathname-directory tmpdir) + (ironclad:byte-array-to-hex-string (ironclad:random-data 10))) + :defaults tmpdir))) + (ensure-directories-exist tmp-tmpdir) + tmp-tmpdir)) + +(defmacro with-temporary-dir (sym &body expr) + `(let ((,sym (make-temporary-dir))) + (unwind-protect + (progn ,@expr) + (uiop:delete-directory-tree ,sym :validate t)))) diff --git a/tests/decode-tests.lisp b/tests/decode-tests.lisp index 5053d11..918cd82 100644 --- a/tests/decode-tests.lisp +++ b/tests/decode-tests.lisp @@ -96,7 +96,9 @@ (assert-array-decode (make-octets 16385 :element 8) 1024) (assert-array-decode (make-octets 32767 :element 9) 1024) (assert-array-decode (make-octets 32768 :element 10) 1024) - (assert-array-decode (make-octets 131072 :element 11) 1024)) + (assert-array-decode (make-octets 131072 :element 11) 1024) + (for-all ((buffer (gen-buffer :length (gen-integer :min 0 :max 40000)))) + (assert-array-decode buffer 1024))) (test simple-decoding-32kib (assert-array-decode (make-octets 1 :element 2) 32kib) @@ -104,7 +106,9 @@ (assert-array-decode (make-octets 32768 :element 2) 32kib) (assert-array-decode (make-octets 32769 :element 2) 32kib) (assert-array-decode (make-octets 32768 :element 2) 32kib) - (assert-array-decode (make-octets 16777216 :element 2) 32kib)) + (assert-array-decode (make-octets 16777216 :element 2) 32kib) + (for-all ((buffer (gen-buffer :length (gen-integer :min 0 :max 70000)))) + (assert-array-decode buffer 32kib))) (test proper-return-values (assert-bytes-read (make-octets 1 :element 3) 1024 (1)) @@ -127,12 +131,6 @@ (stream (eris-decode read-capability #'hashtable-decode))) (setf (stream-file-position 
stream) ,pos) (stream-read-sequence stream buf 0 (length buf)) - ;; (print (pos (buffer stream))) - ;; (print (+ 24 ,buffer-pos)) - ;; (print (pos stream)) - ;; (print (+ 24 ,pos)) - ;; (print buf) - ;; (print ,array-at-pos) (is (and (eql (eris::pos (eris::buffer stream)) (+ 24 ,buffer-pos)) @@ -200,7 +198,9 @@ (assert-length (make-array 1024 :element-type '(unsigned-byte 8) :initial-element 2) 1024) (assert-length (make-array 2048 :element-type '(unsigned-byte 8) :initial-element 2) 1024) (assert-length (make-array 16383 :element-type '(unsigned-byte 8) :initial-element 2) 1024) - (assert-length (make-array 16384 :element-type '(unsigned-byte 8) :initial-element 2) 1024)) + (assert-length (make-array 16384 :element-type '(unsigned-byte 8) :initial-element 2) 1024) + (for-all ((buffer (gen-buffer :length (gen-integer :min 0 :max 40000)))) + (assert-length buffer 1024))) (defmacro assert-read-byte (array block-size) diff --git a/tests/encode-tests.lisp b/tests/encode-tests.lisp index abbeb0d..b4c6892 100644 --- a/tests/encode-tests.lisp +++ b/tests/encode-tests.lisp @@ -16,6 +16,7 @@ (in-package :eris/test) (def-suite* encoding-tests :in eris-tests) + (defmacro check-urn (data block-size urn &key (secret null-secret)) `(let ((urn ,urn) (vector-encode (read-capability-to-urn @@ -71,17 +72,120 @@ ,urn))))) (test 100MiB - (large-content-test (make-array 24 :element-type '(unsigned-byte 8) - :initial-contents - #(49 48 48 77 105 66 32 40 98 108 111 99 107 32 115 105 122 101 32 49 75 105 66 41)) - 1024 - "urn:eris:BIC6F5EKY2PMXS2VNOKPD3AJGKTQBD3EXSCSLZIENXAXBM7PCTH2TCMF5OKJWAN36N4DFO6JPFZBR3MS7ECOGDYDERIJJ4N5KAQSZS67YY" - 104857600)) + (large-content-test + (make-array 24 :element-type '(unsigned-byte 8) + :initial-contents + #(49 48 48 77 105 66 32 40 98 108 111 99 107 32 115 105 122 101 32 49 75 105 66 41)) + 1024 + "urn:eris:BIC6F5EKY2PMXS2VNOKPD3AJGKTQBD3EXSCSLZIENXAXBM7PCTH2TCMF5OKJWAN36N4DFO6JPFZBR3MS7ECOGDYDERIJJ4N5KAQSZS67YY" + 104857600)) (test 1GiB - 
(large-content-test (make-array 23 :element-type '(unsigned-byte 8) - :initial-contents - #(49 71 105 66 32 40 98 108 111 99 107 32 115 105 122 101 32 51 50 75 105 66 41)) - 32kib - "urn:eris:B4BL4DKSEOPGMYS2CU2OFNYCH4BGQT774GXKGURLFO5FDXAQQPJGJ35AZR3PEK6CVCV74FVTAXHRSWLUUNYYA46ZPOPDOV2M5NVLBETWVI" - 1073741824)) + (large-content-test + (make-array 23 :element-type '(unsigned-byte 8) + :initial-contents + #(49 71 105 66 32 40 98 108 111 99 107 32 115 105 122 101 32 51 50 75 105 66 41)) + 32kib + "urn:eris:B4BL4DKSEOPGMYS2CU2OFNYCH4BGQT774GXKGURLFO5FDXAQQPJGJ35AZR3PEK6CVCV74FVTAXHRSWLUUNYYA46ZPOPDOV2M5NVLBETWVI" + 1073741824)) + + +(defmacro encode-consensus-test (tmpdir data block-size &key (secret (random-data 32))) + "Test if all the eris-encode methods give the same results." + `(let ((pathname-encode + (let ((pathname (merge-pathnames (crypto:byte-array-to-hex-string (crypto:random-data 16)) + ,tmpdir))) + (with-open-file (f pathname + :direction :output + :element-type 'serapeum:octet + :if-does-not-exist :create) + (write-sequence ,data f)) + (read-capability-to-urn + (eris-encode pathname + ,block-size + (lambda (block ref) (declare (ignore ref)) block) + :secret ,secret)))) + (file-stream-encode + (let ((pathname (merge-pathnames (crypto:byte-array-to-hex-string (crypto:random-data 16)) + ,tmpdir))) + (with-open-file (f pathname + :direction :output + :element-type 'serapeum:octet + :if-does-not-exist :create) + (write-sequence ,data f)) + (read-capability-to-urn + (with-open-file (f pathname :direction :input + :element-type 'serapeum:octet) + (eris-encode f + ,block-size + (lambda (block ref) (declare (ignore ref)) block) + :secret ,secret))))) + (vector-encode + (read-capability-to-urn + (eris-encode ,data + ,block-size + (lambda (block ref) (declare (ignore ref)) block) + :secret ,secret))) + (stream-encode + (read-capability-to-urn + (with-octet-input-stream (stream ,data) + (eris-encode stream + ,block-size + (lambda (block ref) (declare (ignore ref)) 
block) + :secret ,secret))))) + (is (serapeum:equalp* vector-encode stream-encode pathname-encode file-stream-encode)))) + +(test encoding-consensus-tests + (with-temporary-dir tdir + (encode-consensus-test tdir (make-octets 1 :element 2) 1024) + (encode-consensus-test tdir (make-octets 512 :element 2) 1024) + (encode-consensus-test tdir (make-octets 1023 :element 2) 1024) + (encode-consensus-test tdir (make-octets 1024 :element 2) 1024) + (encode-consensus-test tdir (make-octets 16383 :element 2) 1024) + (encode-consensus-test tdir (make-octets 16384 :element 2) 1024) + (encode-consensus-test tdir (make-octets 1024 :element 2) 32kib) + (encode-consensus-test tdir (make-octets 32767 :element 2) 32kib) + (encode-consensus-test tdir (make-octets 32768 :element 2) 32kib) + (encode-consensus-test tdir (make-octets 64000 :element 2) 32kib) + (for-all ((buffer (gen-buffer :length (gen-integer :min 1 :max 70000)))) + (encode-consensus-test tdir buffer 1024) + (encode-consensus-test tdir buffer 32kib)))) + + +(test encoding-nothing + (with-temporary-dir tdir + (encode-consensus-test tdir (make-octets 0) 1024) + (encode-consensus-test tdir (make-octets 0) 32kib))) + + +(defmacro encoding-file-pos (tmpdir data pos block-size &key (secret null-secret)) + `(let ((vector-encode (read-capability-to-urn + (eris-encode (subseq ,data ,pos) + ,block-size + (lambda (block ref) (declare (ignore ref)) block) + :secret ,secret))) + (file-stream-encode + (let ((pathname (merge-pathnames (crypto:byte-array-to-hex-string (crypto:random-data 16)) + ,tmpdir))) + (with-open-file (f pathname + :direction :output + :element-type 'serapeum:octet + :if-does-not-exist :create) + (write-sequence ,data f)) + (read-capability-to-urn + (with-open-file (f pathname :direction :input + :element-type 'serapeum:octet) + (file-position f ,pos) + (eris-encode f + ,block-size + (lambda (block ref) (declare (ignore ref)) block) + :secret ,secret)))))) + (is (equalp vector-encode file-stream-encode)))) + +(test 
encoding-file-position-tests + (with-temporary-dir tdir + (encoding-file-pos tdir (make-octets 1024 :element 2) 512 1024) + (encoding-file-pos tdir (make-octets 1024 :element 2) 1023 1024) + (encoding-file-pos tdir (make-octets 1024 :element 2) 1 1024) + (encoding-file-pos tdir (make-octets 32000 :element 2) 1673 32kib) + (encoding-file-pos tdir (make-octets 32000 :element 2) 31999 32kib))) -- cgit v1.2.3