I’m trying to write an arbitrary base-N chunked encoder. This is different from a “mathematical” encoder, where the entire buffer is converted to one big integer and that integer is then written out in base N. It’s supposed to work the way base 64 does: read the buffer in chunks of 3 bytes and output 4 chars per chunk.
Bases that aren’t powers of 2 “waste” bits, because the number of bits in N bytes never lines up exactly with what M chars can represent.
For base 3 in particular, I’ve computed that 12 bytes can be encoded into 61 chars with a wastage of about 0.683 bits, because log2(3**61) ≈ 96.683 while 12*8 = 96, so roughly 0.683 bits per chunk are left unused.
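Just to make that arithmetic concrete, here’s roughly how I computed it (a standalone sketch, not part of the encoder):
// Standalone sketch: chars per chunk and wasted bits for 12 bytes in base 3.
const bits = 12 * 8;                                   // 96
const charsNeeded = Math.ceil(bits / Math.log2(3));    // 61
const wastedBits = charsNeeded * Math.log2(3) - bits;  // ≈ 0.683
console.log(charsNeeded, wastedBits.toFixed(3));       // 61 "0.683"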
What this looks like is:
expect(base3encoder.encode([0,0,0,0,0,0,0,0,0,0,0,1])).toBe('0000000000000000000000000000000000000000000000000000000000001')
expect(base3encoder.encode([0,0,0,0,0,0,0,0,0,0,0,2])).toBe('0000000000000000000000000000000000000000000000000000000000002')
expect(base3encoder.encode([0,0,0,0,0,0,0,0,0,0,0,3])).toBe('0000000000000000000000000000000000000000000000000000000000010')
expect(base3encoder.encode([0,0,0,0,0,0,0,0,0,0,1,0]),"256 = 1*3**5 + 1*3**2 + 1*3**1 + 1*3**0").toBe('0000000000000000000000000000000000000000000000000000000100111')
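As a sanity check on that last expectation, independent of my encoder:
// 100111 (base 3) = 1*3**5 + 1*3**2 + 1*3**1 + 1*3**0 = 256
console.assert(parseInt('100111', 3) === 256);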
When the input is not a multiple of 12 bytes, we have to add some padding.
I’ve got this working perfectly with base 64 (it matches the native implementation), but for base 3 my algorithm is a little off, and I can’t quite put my finger on where the problem is.
I’ve modified the code and test so they run directly here on Stack Overflow. I believe the bug is in ChunkedBufferEncoder.decode, but it could be in ChunkedBufferEncoder.encode. I know encode works for inputs that are multiples of 12 bytes (as shown above), but for anything else it goes a little crazy.
I’ve included a base64encoder in the test. That currently passes all tests and should continue to pass.
Where is the bug?
function getView(buffer) {
  if (ArrayBuffer.isView(buffer)) {
    return new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
  }
  if (buffer instanceof ArrayBuffer) {
    return new DataView(buffer, 0, buffer.byteLength);
  }
  return new DataView(Uint8Array.from(buffer).buffer);
}
function bufToInt(buffer) {
  if (buffer.length <= 1) {
    if (buffer.length === 0) {
      return 0n;
    }
    return BigInt(buffer[0]);
  }
  if (buffer.length >= 8) {
    let i = 0;
    let result = 0n;
    const view = getView(buffer);
    const end = buffer.length - 8;
    for (;;) {
      result |= BigInt(view.getBigUint64(i, false));
      i += 8;
      if (i >= end) {
        break;
      }
      result <<= 64n;
    }
    for (; i < buffer.length; ++i) {
      result = result << 8n | BigInt(buffer[i]);
    }
    return result;
  }
  let result = BigInt(buffer[0]);
  for (let i = 1; i < buffer.length; ++i) {
    result = result << 8n | BigInt(buffer[i]);
  }
  return result;
}
function calcCharsPerChunk(bytesPerChunk, base) {
  const min = 2n ** BigInt(8 * bytesPerChunk);
  let c = 1;
  let val = base;
  for (;;) {
    if (val >= min) return c;
    val *= base;
    ++c;
  }
}
function toArray(arr) {
  return Array.isArray(arr) ? arr : Array.from(arr);
}
class ChunkedBufferEncoder {
  alphabet;
  reverse;
  base;
  bytesPerChunk;
  charsPerChunk;
  constructor(alphabet, bytesPerChunk, charsPerChunk) {
    this.alphabet = toArray(alphabet);
    console.assert(this.alphabet.length >= 2);
    this.reverse = new Map(this.alphabet.map(((ch, i) => [ ch, BigInt(i) ])));
    this.base = BigInt(this.alphabet.length);
    this.bytesPerChunk = bytesPerChunk;
    if (charsPerChunk == null) {
      this.charsPerChunk = calcCharsPerChunk(bytesPerChunk, this.base);
    } else {
      this.charsPerChunk = charsPerChunk;
      console.assert(this.alphabet.length ** this.charsPerChunk >= 2 ** (8 * bytesPerChunk));
    }
  }
  encode(arr) {
    if (!arr?.length) {
      return "";
    }
    const buf = Uint8Array.from(arr);
    let i = 0;
    let result = "";
    do {
      const chunk = buf.slice(i, i + this.bytesPerChunk);
      let val = bufToInt(chunk);
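      // Partial final chunk: treat the missing bytes as zero, then trim one
      // output char per missing byte.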
      if (chunk.length < this.bytesPerChunk) {
        const missingBytes = this.bytesPerChunk - chunk.length;
        val <<= 8n * BigInt(missingBytes);
        result += this.intToArr(val).slice(0, -missingBytes).join("");
        return result;
      }
      result += this.intToStr(val);
      i += this.bytesPerChunk;
    } while (i < buf.length);
    return result;
  }
  padEnd(chunk) {
    if (chunk.length >= this.charsPerChunk) return chunk;
    return chunk.concat(Array(this.charsPerChunk - chunk.length).fill(this.alphabet[0]));
  }
  decode(str) {
    if (!str?.length) return new Uint8Array;
    const out = [];
    let i = 0;
    const arr = toArray(str);
    while (i < arr.length) {
      const chunk = arr.slice(i, i + this.charsPerChunk);
      if (chunk.length === this.charsPerChunk) {
        const num = this.arrToInt(chunk);
        for (let j = this.bytesPerChunk - 1; j >= 0; j--) {
          out.push(Number(num >> BigInt(8 * j) & 0xffn));
        }
      } else {
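        // Partial final chunk of chars: pad with the zero char, convert, then
        // shift off 8 bits and emit one fewer byte per missing char.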
        const missing = this.charsPerChunk - chunk.length;
        let num = this.arrToInt(this.padEnd(chunk));
        num >>= BigInt(8 * missing);
        const byteCount = this.bytesPerChunk - missing;
        for (let j = byteCount - 1; j >= 0; --j) {
          out.push(Number(num >> BigInt(8 * j) & 0xffn));
        }
        break;
      }
      i += chunk.length;
    }
    return new Uint8Array(out);
  }
  arrToInt(arr) {
    let num = 0n;
    for (const ch of arr) {
      num = num * this.base + this.reverse.get(ch);
    }
    return num;
  }
  strToInt(str) {
    return this.arrToInt(toArray(str));
  }
  intToStr(num) {
    if (!num) {
      return this.alphabet[0].repeat(this.charsPerChunk);
    }
    let n = BigInt(num);
    let result = "";
    do {
      const rem = n % this.base;
      result = this.alphabet[Number(rem)] + result;
      n /= this.base;
    } while (n > 0n);
    return result.padStart(this.charsPerChunk, this.alphabet[0]);
  }
  intToArr(num) {
    if (!num) {
      return Array(this.charsPerChunk).fill(this.alphabet[0]);
    }
    let n = BigInt(num);
    let result = [];
    do {
      const rem = n % this.base;
      result.unshift(this.alphabet[Number(rem)]);
      n /= this.base;
    } while (n > 0n);
    while (result.length < this.charsPerChunk) {
      result.unshift(this.alphabet[0]);
    }
    return result;
  }
}
// ---------- TEST ----------
function randomUint8Array(minLen, maxLen) {
  return crypto.getRandomValues(new Uint8Array(Math.floor(Math.random() * (maxLen - minLen + 1)) + minLen))
}
const base64encoder = new ChunkedBufferEncoder('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/',3)
const base3encoder = new ChunkedBufferEncoder('012', 12, 61)
const NUM_TESTS = 10000
const MIN_BYTES = 1
const MAX_BYTES = 17
function arrEq(a, b) {
  if (a.length !== b.length) return false;
  for (let i = 0; i < a.length; i++) {
    if (a[i] !== b[i]) return false;
  }
  return true;
}
function uint8ArrayToHex(arr) {
  return Array.from(arr, b => b.toString(16).toUpperCase().padStart(2, '0')).join(' ')
}
for (const encoder of [base64encoder, base3encoder]) {
  for (let i = 0; i < NUM_TESTS; i++) {
    const buf = randomUint8Array(MIN_BYTES, MAX_BYTES)
    const encoded = encoder.encode(buf)
    const decoded = encoder.decode(encoded)
    if (!arrEq(buf, decoded)) throw new Error(
      `Buf: ${uint8ArrayToHex(buf)} Encoded: ${encoded} Decoded: ${uint8ArrayToHex(decoded)}`
    )
  }
}
It might be the .slice(0,-missingBytes).
[0,0,0,0,0,0,0,0,0,0,1] (11 elements) gets padded to [0,0,0,0,0,0,0,0,0,0,1,0], which encodes to 0000000000000000000000000000000000000000000000000000000100111, but when that is decoded again it comes out as [0,0,0,0,0,0,0,0,0,0,1,0], with the “missing” byte appended. I don’t want that. Base64 uses = as padding, but it isn’t really necessary, because you can figure out how many bytes/chars there are supposed to be with a bit of math. I’m not actually sure whether something similar is possible with non-power-of-2 bases.
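For base 64, the “bit of math” I mean is just something like this (hypothetical helpers, not part of my code); what I don’t know is whether an equivalent exists for non-power-of-2 bases:
// Base 64 packs 6 bits per char, so lengths are recoverable without '=' padding:
const charsForBytes = bytes => Math.ceil(bytes * 8 / 6);  // 1 -> 2, 2 -> 3, 3 -> 4
const bytesForChars = chars => Math.floor(chars * 6 / 8); // 2 -> 1, 3 -> 2, 4 -> 3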