Llama 4 Scout returns `result.response` as an array of objects
[{"generated_text":"..."}] instead of a plain string. Decode into
json.RawMessage and try both shapes; fall back to generated_text[0].
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
254 lines
8.8 KiB
Go
// Text generation via Cloudflare Workers AI LLM models.
//
// API reference:
//
// POST https://api.cloudflare.com/client/v4/accounts/{accountID}/ai/run/{model}
// Authorization: Bearer {apiToken}
// Content-Type: application/json
//
// Request body (all models):
//
// { "messages": [{"role":"system","content":"..."},{"role":"user","content":"..."}] }
//
// Response (wrapped):
//
// { "result": { "response": "..." }, "success": true }
package cfai
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"time"
|
|
)
|
|
|
|
// TextModel identifies a Cloudflare Workers AI text generation model.
type TextModel string

// Supported Workers AI model IDs, grouped by vendor in declaration order.
const (
	// TextModelGemma4 — Google Gemma 4, 256k context.
	TextModelGemma4 TextModel = "@cf/google/gemma-4-26b-a4b-it"
	// TextModelLlama4Scout — Meta Llama 4 Scout 17B, multimodal.
	TextModelLlama4Scout TextModel = "@cf/meta/llama-4-scout-17b-16e-instruct"
	// TextModelLlama33_70B — Meta Llama 3.3 70B, fast fp8.
	TextModelLlama33_70B TextModel = "@cf/meta/llama-3.3-70b-instruct-fp8-fast"
	// TextModelQwen3_30B — Qwen3 30B MoE, function calling.
	TextModelQwen3_30B TextModel = "@cf/qwen/qwen3-30b-a3b-fp8"
	// TextModelMistralSmall — Mistral Small 3.1 24B, 128k context.
	TextModelMistralSmall TextModel = "@cf/mistralai/mistral-small-3.1-24b-instruct"
	// TextModelQwQ32B — Qwen QwQ 32B reasoning model.
	TextModelQwQ32B TextModel = "@cf/qwen/qwq-32b"
	// TextModelDeepSeekR1 — DeepSeek R1 distill Qwen 32B.
	TextModelDeepSeekR1 TextModel = "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"
	// TextModelGemma3_12B — Google Gemma 3 12B, 80k context.
	TextModelGemma3_12B TextModel = "@cf/google/gemma-3-12b-it"
	// TextModelGPTOSS120B — OpenAI gpt-oss-120b, high reasoning.
	TextModelGPTOSS120B TextModel = "@cf/openai/gpt-oss-120b"
	// TextModelGPTOSS20B — OpenAI gpt-oss-20b, lower latency.
	TextModelGPTOSS20B TextModel = "@cf/openai/gpt-oss-20b"
	// TextModelNemotron3 — NVIDIA Nemotron 3 120B, agentic.
	TextModelNemotron3 TextModel = "@cf/nvidia/nemotron-3-120b-a12b"
	// TextModelLlama32_3B — Meta Llama 3.2 3B, lightweight.
	TextModelLlama32_3B TextModel = "@cf/meta/llama-3.2-3b-instruct"

	// DefaultTextModel is the default model used when none is specified.
	DefaultTextModel = TextModelLlama4Scout
)
|
|
|
|
// TextModelInfo describes a single text generation model.
|
|
type TextModelInfo struct {
|
|
ID string `json:"id"`
|
|
Label string `json:"label"`
|
|
Provider string `json:"provider"`
|
|
ContextSize int `json:"context_size"` // max context in tokens
|
|
Description string `json:"description"`
|
|
}
|
|
|
|
// AllTextModels returns metadata about every supported text generation model.
|
|
func AllTextModels() []TextModelInfo {
|
|
return []TextModelInfo{
|
|
{
|
|
ID: string(TextModelGemma4), Label: "Gemma 4 26B", Provider: "Google",
|
|
ContextSize: 256000,
|
|
Description: "Google's most intelligent open model family. 256k context, function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama4Scout), Label: "Llama 4 Scout 17B", Provider: "Meta",
|
|
ContextSize: 131000,
|
|
Description: "Natively multimodal, 16 experts. Good all-purpose model with function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama33_70B), Label: "Llama 3.3 70B (fp8 fast)", Provider: "Meta",
|
|
ContextSize: 24000,
|
|
Description: "Llama 3.3 70B quantized to fp8 for speed. Excellent instruction following.",
|
|
},
|
|
{
|
|
ID: string(TextModelQwen3_30B), Label: "Qwen3 30B MoE", Provider: "Qwen",
|
|
ContextSize: 32768,
|
|
Description: "MoE architecture with strong reasoning and instruction following.",
|
|
},
|
|
{
|
|
ID: string(TextModelMistralSmall), Label: "Mistral Small 3.1 24B", Provider: "MistralAI",
|
|
ContextSize: 128000,
|
|
Description: "Strong text performance with 128k context and function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelQwQ32B), Label: "QwQ 32B (reasoning)", Provider: "Qwen",
|
|
ContextSize: 24000,
|
|
Description: "Reasoning model — thinks before answering. Slower but more accurate.",
|
|
},
|
|
{
|
|
ID: string(TextModelDeepSeekR1), Label: "DeepSeek R1 32B", Provider: "DeepSeek",
|
|
ContextSize: 80000,
|
|
Description: "R1-distilled reasoning model. Outperforms o1-mini on many benchmarks.",
|
|
},
|
|
{
|
|
ID: string(TextModelGemma3_12B), Label: "Gemma 3 12B", Provider: "Google",
|
|
ContextSize: 80000,
|
|
Description: "Multimodal, 128k context, multilingual (140+ languages).",
|
|
},
|
|
{
|
|
ID: string(TextModelGPTOSS120B), Label: "GPT-OSS 120B", Provider: "OpenAI",
|
|
ContextSize: 128000,
|
|
Description: "OpenAI open-weight model for production, general purpose, high reasoning.",
|
|
},
|
|
{
|
|
ID: string(TextModelGPTOSS20B), Label: "GPT-OSS 20B", Provider: "OpenAI",
|
|
ContextSize: 128000,
|
|
Description: "OpenAI open-weight model for lower latency and specialized use cases.",
|
|
},
|
|
{
|
|
ID: string(TextModelNemotron3), Label: "Nemotron 3 120B", Provider: "NVIDIA",
|
|
ContextSize: 256000,
|
|
Description: "Hybrid MoE with leading accuracy for multi-agent applications.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama32_3B), Label: "Llama 3.2 3B", Provider: "Meta",
|
|
ContextSize: 80000,
|
|
Description: "Lightweight model for simple tasks. Fast and cheap.",
|
|
},
|
|
}
|
|
}
|
|
|
|
// TextMessage is a single message in a chat conversation.
|
|
type TextMessage struct {
|
|
Role string `json:"role"` // "system" or "user"
|
|
Content string `json:"content"` // message text
|
|
}
|
|
|
|
// TextRequest is the input to Generate.
|
|
type TextRequest struct {
|
|
// Model is the CF Workers AI model ID. Defaults to DefaultTextModel when empty.
|
|
Model TextModel
|
|
// Messages is the conversation history (system + user messages).
|
|
Messages []TextMessage
|
|
// MaxTokens limits the output length (0 = model default).
|
|
MaxTokens int
|
|
}
|
|
|
|
// TextGenClient generates text via Cloudflare Workers AI LLM models.
|
|
type TextGenClient interface {
|
|
// Generate sends a chat-style request and returns the model's response text.
|
|
Generate(ctx context.Context, req TextRequest) (string, error)
|
|
|
|
// Models returns metadata about all supported text generation models.
|
|
Models() []TextModelInfo
|
|
}
|
|
|
|
// textGenHTTPClient is the concrete CF AI text generation client.
|
|
type textGenHTTPClient struct {
|
|
accountID string
|
|
apiToken string
|
|
http *http.Client
|
|
}
|
|
|
|
// NewTextGen returns a TextGenClient for the given Cloudflare account.
|
|
func NewTextGen(accountID, apiToken string) TextGenClient {
|
|
return &textGenHTTPClient{
|
|
accountID: accountID,
|
|
apiToken: apiToken,
|
|
http: &http.Client{Timeout: 5 * time.Minute},
|
|
}
|
|
}
|
|
|
|
// Generate sends messages to the model and returns the response text.
|
|
func (c *textGenHTTPClient) Generate(ctx context.Context, req TextRequest) (string, error) {
|
|
if req.Model == "" {
|
|
req.Model = DefaultTextModel
|
|
}
|
|
|
|
body := map[string]any{
|
|
"messages": req.Messages,
|
|
}
|
|
if req.MaxTokens > 0 {
|
|
body["max_tokens"] = req.MaxTokens
|
|
}
|
|
|
|
encoded, err := json.Marshal(body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: marshal: %w", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("https://api.cloudflare.com/client/v4/accounts/%s/ai/run/%s",
|
|
c.accountID, string(req.Model))
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(encoded))
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Authorization", "Bearer "+c.apiToken)
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: http: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
errBody, _ := io.ReadAll(resp.Body)
|
|
msg := string(errBody)
|
|
if len(msg) > 300 {
|
|
msg = msg[:300]
|
|
}
|
|
return "", fmt.Errorf("cfai/text: model %s returned %d: %s", req.Model, resp.StatusCode, msg)
|
|
}
|
|
|
|
// CF AI wraps responses: { "result": { "response": "..." }, "success": true }
|
|
// Some models (e.g. Llama 4 Scout) return response as an array:
|
|
// { "result": { "response": [{"generated_text":"..."}] } }
|
|
var wrapper struct {
|
|
Result struct {
|
|
Response json.RawMessage `json:"response"`
|
|
} `json:"result"`
|
|
Success bool `json:"success"`
|
|
Errors []string `json:"errors"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&wrapper); err != nil {
|
|
return "", fmt.Errorf("cfai/text: decode response: %w", err)
|
|
}
|
|
if !wrapper.Success {
|
|
return "", fmt.Errorf("cfai/text: model %s error: %v", req.Model, wrapper.Errors)
|
|
}
|
|
// Try plain string first.
|
|
var text string
|
|
if err := json.Unmarshal(wrapper.Result.Response, &text); err == nil {
|
|
return text, nil
|
|
}
|
|
// Fall back: array of objects with a "generated_text" field.
|
|
var arr []struct {
|
|
GeneratedText string `json:"generated_text"`
|
|
}
|
|
if err := json.Unmarshal(wrapper.Result.Response, &arr); err == nil && len(arr) > 0 {
|
|
return arr[0].GeneratedText, nil
|
|
}
|
|
return "", fmt.Errorf("cfai/text: model %s: unrecognised response shape: %s", req.Model, wrapper.Result.Response)
|
|
}
|
|
|
|
// Models returns all supported text generation model metadata.
|
|
func (c *textGenHTTPClient) Models() []TextModelInfo {
|
|
return AllTextModels()
|
|
}
|