From ff9f3517ec01ede9e1b790cf6fbc7e4fa603d5a1 Mon Sep 17 00:00:00 2001 From: ZauberNerd Date: Fri, 7 Jul 2023 15:21:20 +0200 Subject: [PATCH] Replace Node.js's Buffer with native Typed Arrays on the client-side Vite does not natively ship a Buffer polyfill, and most of the functionality required here can be implemented natively (except for the byte-wise compare, for which I had to write my own function). --- app/package.json | 1 - app/src/core/tokenizer/bpe.ts | 23 ++++++++++++++--------- app/src/core/utils/index.ts | 23 +++++++++++++++++++++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/app/package.json b/app/package.json index 33f1bbf..4303d16 100644 --- a/app/package.json +++ b/app/package.json @@ -13,7 +13,6 @@ "@msgpack/msgpack": "^3.0.0-beta2", "@reduxjs/toolkit": "^1.9.3", "broadcast-channel": "^4.20.2", - "buffer": "^6.0.3", "comlink": "^4.4.1", "events": "^3.3.0", "idb-keyval": "^6.2.0", diff --git a/app/src/core/tokenizer/bpe.ts b/app/src/core/tokenizer/bpe.ts index e8d5b96..c71cb3b 100644 --- a/app/src/core/tokenizer/bpe.ts +++ b/app/src/core/tokenizer/bpe.ts @@ -1,7 +1,12 @@ +import { compareUint8Array } from "../utils"; + const MAX_NUM_THREADS = 128; type MergeRange = { start: number, end: number }; +const textDecoder = new TextDecoder(); +const textEncoder = new TextEncoder(); + export class RankMap { private values = new Map(); @@ -14,23 +19,23 @@ export class RankMap { } public set(bytes: Uint8Array, rank: number) { - const key = Buffer.from(bytes).toString(); + const key = textDecoder.decode(bytes); this.values.set(key, rank); } public get(bytes: Uint8Array) { - const key = Buffer.from(bytes).toString(); + const key = textDecoder.decode(bytes); return this.values.get(key); } public keys() { - return Array.from(this.values.keys()).map(k => Buffer.from(k)); + return Array.from(this.values.keys()).map(k => textEncoder.encode(k)); } public inverted() { const inverted = new Map(); for (const [key, value] of 
Array.from(this.values.entries())) { - inverted.set(value, new Uint8Array(Buffer.from(key))); + inverted.set(value, textEncoder.encode(key)); } return inverted; } @@ -100,10 +105,10 @@ export class CoreBPE { const decoder: Map = encoder.inverted(); const specialTokensDecoder: Map = new Map( - Array.from(specialTokensEncoder.entries()).map(([k, v]) => [v, new Uint8Array(Buffer.from(k))]) + Array.from(specialTokensEncoder.entries()).map(([k, v]) => [v, textEncoder.encode(k)]) ); const sortedTokenBytes: Uint8Array[] = Array.from(encoder.keys()); - sortedTokenBytes.sort((a, b) => Buffer.compare(a, b)); + sortedTokenBytes.sort((a, b) => compareUint8Array(a, b)); this.encoder = encoder; this.specialTokensEncoder = specialTokensEncoder; @@ -136,7 +141,7 @@ export class CoreBPE { const ret: number[] = []; let match: RegExpExecArray | null; while ((match = regex.exec(text)) !== null) { - const piece = new Uint8Array(Buffer.from(match[0])); + const piece = textEncoder.encode(match[0]); const token = this.encoder.get(piece); if (token !== undefined) { ret.push(token); @@ -167,7 +172,7 @@ export class CoreBPE { const end = nextSpecial === null ? 
text.length : nextSpecial.index; let match: RegExpExecArray | null; while ((match = regex.exec(text.slice(start, end))) !== null) { - const piece = new Uint8Array(Buffer.from(match[0])); + const piece = textEncoder.encode(match[0]); const token = this.encoder.get(piece); if (token !== undefined) { lastPieceTokenLen = 1; @@ -208,7 +213,7 @@ export class CoreBPE { if (token !== undefined) { return token; } - const pieceStr = Buffer.from(piece).toString("utf-8"); + const pieceStr = textDecoder.decode(piece); if (this.specialTokensEncoder.has(pieceStr)) { return this.specialTokensEncoder.get(pieceStr)!; } diff --git a/app/src/core/utils/index.ts b/app/src/core/utils/index.ts index ac1ec39..89699f1 100644 --- a/app/src/core/utils/index.ts +++ b/app/src/core/utils/index.ts @@ -39,6 +39,29 @@ export function cloneArrayBuffer(buffer: ArrayBuffer): ArrayBuffer { return newBuffer; } +/** + * Lexicographically compare two `Uint8Array` instances. + * + * @param {Uint8Array} a - The first `Uint8Array` instance to compare. + * @param {Uint8Array} b - The second `Uint8Array` instance to compare. + * @returns {number} The comparison result. -1 if `a` is "less" than `b`, 1 if `a` is "greater" than `b`, or 0 if they are "equal". + */ +export function compareUint8Array(a: Uint8Array, b: Uint8Array): number { + if (a === b) return 0; + + const len = Math.min(a.byteLength, b.byteLength); + + for (let i = 0; i < len; ++i) { + if (a[i] < b[i]) return -1; + if (a[i] > b[i]) return 1; + } + + if (a.byteLength < b.byteLength) return -1; + if (a.byteLength > b.byteLength) return 1; + + return 0; +} + /** * Shares the specified text using the Web Share API if available in the user's browser. *