tardis-bot/src/tts/BaseEngine.ts

import { writeFile } from "node:fs/promises";

/**
 * Open-ended bag of per-request options handed to `getSpeech` / `getSpeechFile`.
 * Values are typed `unknown`, so an engine must narrow each one before using it.
 * NOTE(review): the accepted keys are presumably provider-specific — confirm against each engine.
 */
export type VoiceParams = Record<string, unknown>;

/**
 * Outcome of `BaseEngine.resolveVoice`, discriminated by `kind`:
 * - `"exact"`: the trimmed, lowercased input is a key of the voice table, or the
 *   engine is freeform and the normalized input is passed through as `voice`;
 * - `"fuzzy"`: exactly one key matched by token prefix; `voice` is that key;
 * - `"ambiguous"`: several keys matched; `candidates` holds them in sorted order;
 * - `"none"`: no key matched, or the input contained no tokens.
 */
export type VoiceResolution =
  | { kind: "exact"; voice: string }
  | { kind: "fuzzy"; voice: string }
  | { kind: "ambiguous"; candidates: string[] }
  | { kind: "none" };

/**
 * Joins up to `max` candidate voice names for a user-facing ambiguity message,
 * summarizing the rest as "(+N more)".
 *
 * @param candidates Voice names to present, in the order they should appear.
 * @param max Maximum number of names to list. It is normalized to a
 *   non-negative integer: fractions are floored, negatives and `NaN` become 0.
 * @returns The listed names joined by ", ", followed by " (+N more)" when some
 *   were left out; only "(+N more)" when none are listed; "" for an empty list.
 */
export function formatCandidates(candidates: string[], max = 5): string {
  // A raw negative `max` would make slice() drop items from the END and inflate
  // the "+N more" count; a fractional one would print e.g. "(+0.5 more)".
  const limit = Number.isNaN(max) ? 0 : Math.max(0, Math.floor(max));
  const hidden = candidates.length - limit;
  const shown = candidates.slice(0, limit).join(", ");
  if (hidden <= 0) return shown;

  const suffix = `(+${hidden} more)`;
  // With nothing listed there is no text to separate the suffix from.
  return limit === 0 ? suffix : `${shown} ${suffix}`;
}

/**
 * True when `key` is an own entry of `table`. Members inherited from
 * `Object.prototype` (e.g. "constructor", "__proto__") never count, so
 * user-typed strings cannot accidentally match them.
 */
function hasVoiceKey(table: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(table, key);
}

/**
 * Common contract for every TTS provider. Subclasses override either
 * `getSpeech` (returning a fetch-like Response) or `getSpeechFile` (writing
 * directly to disk) — the default `getSpeechFile` pipes `getSpeech` to a file.
 */
export abstract class BaseEngine {
  /** Short ID used in env / commands (e.g. "azure"). */
  readonly shortName: string;
  /** Human-readable name shown in messages. */
  readonly longName: string;
  /** Output file extension without the dot (e.g. "mp3"). */
  readonly fileExtension: string;

  /**
   * Voice table: user-facing key -> provider identifier, given either directly
   * as a string or as the `name` of a `{ name, lang }` record. Left empty, the
   * engine is treated as freeform (any voice string is accepted).
   */
  protected voices: Record<string, string | { name: string; lang: string }> = {};

  /**
   * @param shortName Short ID of the engine.
   * @param longName Human-readable engine name.
   * @param fileExtension Output file extension without the dot.
   */
  constructor(shortName: string, longName: string, fileExtension: string) {
    this.shortName = shortName;
    this.longName = longName;
    this.fileExtension = fileExtension;
  }

  /**
   * Maps a user-friendly voice name to the provider's internal identifier.
   *
   * @param str Voice key to look up (matched exactly, case-sensitively).
   * @returns The mapped identifier, or `str` unchanged when it is not an own
   *   key of the voice table.
   */
  getInternalVoiceName(str: string): string {
    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor" and return a non-voice value.
    if (!hasVoiceKey(this.voices, str)) return str;
    const v = this.voices[str];
    if (v == null) return str;
    return typeof v === "string" ? v : v.name;
  }

  /** Returns the voice used when a caller does not specify one. */
  abstract getDefaultVoice(): string;

  /**
   * Checks whether `voice` is usable with this engine.
   *
   * @returns `true` when the voice table is empty (any voice is accepted) or
   *   when `voice` is an own key of the table; `false` otherwise.
   */
  validateVoice(voice: string): boolean {
    if (Object.keys(this.voices).length === 0) return true;
    return hasVoiceKey(this.voices, voice) && this.voices[voice] != null;
  }

  /**
   * Returns the keys of the voice table in sorted order. Empty if the engine
   * accepts any voice. Keys are returned as stored; they are expected to be
   * lowercase because `resolveVoice` lowercases input before its exact lookup.
   */
  listVoices(): string[] {
    return Object.keys(this.voices).sort();
  }

  /** True when this engine has no static voice table and accepts any voice string. */
  isFreeformVoice(): boolean {
    return Object.keys(this.voices).length === 0;
  }

  /**
   * Resolves a user-typed voice string against this engine's voice table using
   * exact match first, then token-prefix matching: each whitespace-separated
   * input token must be a prefix of some alphanumeric token of a key
   * (compared case-insensitively).
   * Freeform engines always succeed with the normalized input.
   *
   * @param input Raw user input; it is trimmed and lowercased before matching.
   * @returns See {@link VoiceResolution}. For "fuzzy" / "ambiguous" results the
   *   voice names are the table keys exactly as stored.
   */
  resolveVoice(input: string): VoiceResolution {
    const norm = input.trim().toLowerCase();
    if (this.isFreeformVoice()) return { kind: "exact", voice: norm };
    // Own keys only, so inputs like "constructor" do not resolve as voices.
    if (hasVoiceKey(this.voices, norm) && this.voices[norm] != null) {
      return { kind: "exact", voice: norm };
    }

    const inputTokens = norm.split(/\s+/).filter(Boolean);
    if (inputTokens.length === 0) return { kind: "none" };

    const matches = Object.keys(this.voices)
      .filter((key) => {
        // Input tokens are lowercase, so key tokens must be lowercased too for
        // the prefix comparison to be case-insensitive.
        const keyTokens = key
          .toLowerCase()
          .split(/[^a-z0-9]+/)
          .filter(Boolean);
        return inputTokens.every((it) => keyTokens.some((kt) => kt.startsWith(it)));
      })
      .sort();

    const [only] = matches;
    if (matches.length === 1 && only !== undefined) return { kind: "fuzzy", voice: only };
    if (matches.length > 1) return { kind: "ambiguous", candidates: matches };
    return { kind: "none" };
  }

  /**
   * Default implementation: subclass should override either this or getSpeechFile.
   *
   * @throws Error always, unless overridden.
   */
  async getSpeech(_text: string, _voice?: string, _params?: VoiceParams): Promise<Response> {
    throw new Error(`${this.shortName}: getSpeech not implemented`);
  }

  /**
   * Synthesizes `text` via `getSpeech` and writes the response body to `filepath`.
   *
   * @param text Text to synthesize.
   * @param filepath Destination path of the audio file.
   * @param voice Voice to use; defaults to `getDefaultVoice()`.
   * @param params Extra options passed through to `getSpeech`.
   * @returns `filepath`, once the file has been written.
   * @throws Error when `getSpeech` fails, when the response reports a failed
   *   HTTP status (`ok === false`), or when writing the file fails.
   */
  async getSpeechFile(
    text: string,
    filepath: string,
    voice: string = this.getDefaultVoice(),
    params: VoiceParams = {},
  ): Promise<string> {
    const data = await this.getSpeech(text, voice, params);
    // A failed response carries an error payload, not audio; saving it would
    // leave a corrupt file behind. Checked with `=== false` so fetch-like
    // objects that do not expose `ok` keep working as before.
    if (data.ok === false) {
      throw new Error(
        `${this.shortName}: speech request failed with HTTP ${data.status} ${data.statusText}`.trimEnd(),
      );
    }
    const buf = Buffer.from(await data.arrayBuffer());
    await writeFile(filepath, buf);
    return filepath;
  }
}