Fetch voices from Kokoro API at runtime; replace select with styled voice card grid

2026-03-02 10:18:43 +05:00
parent 2107b6e6b8
commit 7589866965
2 changed files with 162 additions and 10 deletions
--- a/scraper/internal/server/server.go
+++ b/scraper/internal/server/server.go
@@ -38,6 +38,10 @@ type Server struct {
 	kokoroURL      string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880
 	kokoroVoice    string // default voice, e.g. af_bella

+	// voiceMu guards cachedVoices.
+	voiceMu      sync.RWMutex
+	cachedVoices []string // populated on first request from Kokoro /v1/audio/voices
+
 	// audioMu guards audioCache and audioInFlight.
 	// audioCache maps a cache key to the Kokoro download filename returned by
 	// POST /v1/audio/speech with return_download_link=true.
@@ -62,6 +66,45 @@ func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log
 	}
 }

+// voices returns the list of available Kokoro voices.
+//
+// If a previous call already fetched a non-empty list it is returned from the
+// cache. Otherwise, when kokoroURL is set, GET /v1/audio/voices is requested
+// from the Kokoro service (5 s timeout) and a non-empty result is cached.
+// If the fetch fails (Kokoro not up yet, network error, non-200 status,
+// undecodable or empty payload) the reason is logged and the hardcoded
+// kokoroVoices list is returned so the UI is never empty.
+//
+// Failures are not cached: every call made before the first successful fetch
+// retries the request, and concurrent callers may each issue their own request.
+func (s *Server) voices() []string {
+	s.voiceMu.RLock()
+	cached := s.cachedVoices
+	s.voiceMu.RUnlock()
+	if len(cached) > 0 {
+		return cached
+	}
+
+	// No Kokoro endpoint configured: nothing to fetch, nothing to warn about.
+	if s.kokoroURL == "" {
+		return kokoroVoices
+	}
+
+	const fallbackMsg = "could not fetch kokoro voices, using built-in list"
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.kokoroURL+"/v1/audio/voices", nil)
+	if err != nil {
+		s.log.Warn(fallbackMsg, "err", err)
+		return kokoroVoices
+	}
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		s.log.Warn(fallbackMsg, "err", err)
+		return kokoroVoices
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		s.log.Warn(fallbackMsg, "status", resp.StatusCode)
+		return kokoroVoices
+	}
+
+	var payload struct {
+		Voices []string `json:"voices"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
+		s.log.Warn(fallbackMsg, "err", err)
+		return kokoroVoices
+	}
+	// An empty list is treated as a failure so the picker never renders blank.
+	if len(payload.Voices) == 0 {
+		s.log.Warn(fallbackMsg, "reason", "empty voice list")
+		return kokoroVoices
+	}
+
+	s.voiceMu.Lock()
+	s.cachedVoices = payload.Voices
+	s.voiceMu.Unlock()
+	s.log.Info("fetched kokoro voices", "count", len(payload.Voices))
+	return payload.Voices
+}
+
 // ListenAndServe starts the HTTP server and blocks until the provided context
 // is cancelled.
 func (s *Server) ListenAndServe(ctx context.Context) error {
--- a/scraper/internal/server/ui.go
+++ b/scraper/internal/server/ui.go
@@ -65,6 +65,64 @@ var kokoroVoices = []string{
 	"zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
 }

+// voiceInfo holds the parsed display metadata for a single Kokoro voice.
+// Values are built by parseVoice, which falls back to the placeholders noted
+// below when an ID does not match the expected pattern.
+type voiceInfo struct {
+	ID     string // raw voice ID, e.g. "af_bella"
+	Name   string // display name, e.g. "Bella"; the raw ID when it cannot be parsed
+	Lang   string // language label, e.g. "EN-US"; "?" when not recognised
+	Gender string // "F" or "M"; "?" when not recognised
+}
+
+// langLabel maps the first character of a voice ID (the language letter of
+// its two-character prefix, e.g. the "a" in "af_bella") to a human-readable
+// language tag. Letters not listed here have no label.
+var langLabel = map[string]string{
+	"a": "EN-US",
+	"b": "EN-GB",
+	"e": "ES",
+	"f": "FR",
+	"h": "HI",
+	"i": "IT",
+	"j": "JA",
+	"p": "PT",
+	"z": "ZH",
+}
+
+// parseVoice decodes a Kokoro voice ID into display metadata.
+//
+// IDs are expected to follow the pattern {lang}{gender}_{name}, e.g.
+// "af_bella": one language letter (looked up in langLabel), one gender letter
+// ("f" or "m"), an underscore, then the name.
+//
+// The returned voiceInfo always carries the raw ID. When the ID is shorter
+// than three bytes or its third byte is not '_', Name is the raw ID and Lang
+// and Gender are "?". An unknown language or gender letter leaves just that
+// field as "?". An empty name part leaves Name as the raw ID.
+func parseVoice(id string) voiceInfo {
+	v := voiceInfo{ID: id, Name: id, Lang: "?", Gender: "?"}
+	if len(id) < 3 || id[2] != '_' {
+		return v
+	}
+
+	if l, ok := langLabel[id[:1]]; ok {
+		v.Lang = l
+	}
+	switch id[1] {
+	case 'f':
+		v.Gender = "F"
+	case 'm':
+		v.Gender = "M"
+	}
+
+	name := id[3:]
+	if name == "" {
+		return v
+	}
+	// Capitalise the name. Only an ASCII lowercase first letter is shifted:
+	// subtracting the case offset from anything else (an already-capitalised
+	// letter, a digit, a non-ASCII rune) would turn it into an unrelated
+	// character.
+	if c := name[0]; c >= 'a' && c <= 'z' {
+		name = string(c-'a'+'A') + name[1:]
+	}
+	// Remaining underscores separate words in the display name.
+	v.Name = strings.ReplaceAll(name, "_", " ")
+	return v
+}
+
+// parseVoices runs parseVoice over every raw voice ID and returns the results
+// in the same order. The result is always non-nil, even for empty input.
+func parseVoices(ids []string) []voiceInfo {
+	parsed := make([]voiceInfo, 0, len(ids))
+	for _, raw := range ids {
+		parsed = append(parsed, parseVoice(raw))
+	}
+	return parsed
+}
+
 // ─── shared layout ────────────────────────────────────────────────────────────

 const layoutHead = `<!DOCTYPE html>
@@ -1932,16 +1990,32 @@ const chapterTmpl = `
       role="dialog"
       aria-label="Reader settings"
       hidden
-       class="absolute right-[max(0.5rem,calc(50%-32rem+0.5rem))] bottom-[calc(100%+0.25rem)] min-w-[260px] bg-zinc-900 border border-zinc-800 rounded-xl p-4 shadow-2xl z-[100]">
-    <label class="block mb-3.5">
-      <span class="block text-xs text-zinc-500 mb-1.5">Voice</span>
-      <select id="tts-voice"
-              class="w-full rounded-lg bg-zinc-800 border border-zinc-700 px-2 py-1.5 text-sm text-zinc-200 outline-none">
+       class="absolute right-[max(0.5rem,calc(50%-32rem+0.5rem))] bottom-[calc(100%+0.25rem)] w-[min(320px,calc(100vw-1rem))] bg-zinc-900 border border-zinc-800 rounded-xl p-4 shadow-2xl z-[100]">
+    <!-- hidden native select keeps existing JS working unchanged -->
+    <select id="tts-voice" class="sr-only" aria-hidden="true" tabindex="-1">
+      {{range .Voices}}
+      <option value="{{.ID}}"{{if eq .ID $.DefaultVoice}} selected{{end}}>{{.Name}}</option>
+      {{end}}
+    </select>
+
+    <div class="mb-3.5">
+      <span class="block text-xs text-zinc-500 mb-2">Voice</span>
+      <div id="voice-grid" class="grid grid-cols-2 gap-1.5 max-h-48 overflow-y-auto pr-0.5">
        {{range .Voices}}
-        <option value="{{.}}"{{if eq . $.DefaultVoice}} selected{{end}}>{{.}}</option>
+        <button type="button"
+                data-voice="{{.ID}}"
+                onclick="selectVoice(this)"
+                class="voice-btn flex items-center gap-2 px-2.5 py-1.5 rounded-lg border text-left transition-colors
+                       {{if eq .ID $.DefaultVoice}}border-amber-500 bg-amber-500/10 text-amber-300{{else}}border-zinc-700 bg-zinc-800 text-zinc-300 hover:border-zinc-500 hover:text-zinc-100{{end}}">
+          <span class="flex-1 min-w-0">
+            <span class="block text-[0.8rem] font-medium leading-tight truncate">{{.Name}}</span>
+            <span class="block text-[0.65rem] text-zinc-500 leading-tight">{{.Lang}} · {{.Gender}}</span>
+          </span>
+        </button>
        {{end}}
-      </select>
-    </label>
+      </div>
+    </div>
+
    <label class="block mb-3.5">
      <span class="block text-xs text-zinc-500 mb-1.5">Speed — <span id="tts-speed-label">1.0×</span></span>
      <input id="tts-speed" type="range"
@@ -2620,6 +2694,41 @@ const chapterTmpl = `
    document.addEventListener('touchend', window.__ttsDoubleTap, { passive: true });
  }());
 }());
+
+// ── Voice card picker ─────────────────────────────────────────────────────────
+window.selectVoice = function (btn) {
+  var voiceSel = document.getElementById('tts-voice');
+  var grid     = document.getElementById('voice-grid');
+  if (!voiceSel || !grid) return;
+
+  // Update hidden select so voiceSel.value works in the existing TTS code.
+  voiceSel.value = btn.dataset.voice;
+  // Persist to localStorage using same key as the TTS IIFE.
+  try { localStorage.setItem('tts_voice', btn.dataset.voice); } catch(_) {}
+
+  // Swap active styling across all cards.
+  grid.querySelectorAll('.voice-btn').forEach(function (b) {
+    var active = b === btn;
+    b.classList.toggle('border-amber-500', active);
+    b.classList.toggle('bg-amber-500/10',  active);
+    b.classList.toggle('text-amber-300',   active);
+    b.classList.toggle('border-zinc-700',  !active);
+    b.classList.toggle('bg-zinc-800',      !active);
+    b.classList.toggle('text-zinc-300',    !active);
+  });
+};
+
+// On page load, sync voice grid selection to the restored localStorage value.
+(function syncVoiceGrid() {
+  var voiceSel = document.getElementById('tts-voice');
+  var grid     = document.getElementById('voice-grid');
+  if (!voiceSel || !grid) return;
+  var saved = null;
+  try { saved = localStorage.getItem('tts_voice'); } catch(_) {}
+  if (!saved) return;
+  var btn = grid.querySelector('[data-voice="' + saved + '"]');
+  if (btn) window.selectVoice(btn);
+})();
 </script>`

 func (s *Server) handleChapter(w http.ResponseWriter, r *http.Request) {
@@ -2665,7 +2774,7 @@ func (s *Server) handleChapter(w http.ResponseWriter, r *http.Request) {
 		Title        string
 		ChapterDate  string
 		AllChapters  interface{}
-		Voices       []string
+		Voices       []voiceInfo
 		DefaultVoice string
 		Cover        string
 	}{
@@ -2677,7 +2786,7 @@ func (s *Server) handleChapter(w http.ResponseWriter, r *http.Request) {
 		Title:        chapterTitle,
 		ChapterDate:  chapterDate,
 		AllChapters:  chapters,
-		Voices:       kokoroVoices,
+		Voices:       parseVoices(s.voices()),
 		DefaultVoice: s.kokoroVoice,
 		Cover:        coverURL,
 	})