Llama 4 Scout returns `result.response` as an array of objects
[{"generated_text":"..."}] instead of a plain string. Decode into
json.RawMessage and try both shapes; fall back to generated_text[0].
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
254 lines
8.8 KiB
Go
// Text generation via Cloudflare Workers AI LLM models.
//
// API reference:
//
// POST https://api.cloudflare.com/client/v4/accounts/{accountID}/ai/run/{model}
// Authorization: Bearer {apiToken}
// Content-Type: application/json
//
// Request body (all models):
//
// { "messages": [{"role":"system","content":"..."},{"role":"user","content":"..."}] }
//
// Response (wrapped):
//
// { "result": { "response": "..." }, "success": true }
package cfai
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"time"
|
|
)
|
|
|
|
// TextModel identifies a Cloudflare Workers AI text generation model.
type TextModel string

// Supported Workers AI model IDs, grouped by vendor in declaration order.
const (
	// TextModelGemma4 — Google Gemma 4, 256k context.
	TextModelGemma4 TextModel = "@cf/google/gemma-4-26b-a4b-it"
	// TextModelLlama4Scout — Meta Llama 4 Scout 17B, multimodal.
	TextModelLlama4Scout TextModel = "@cf/meta/llama-4-scout-17b-16e-instruct"
	// TextModelLlama33_70B — Meta Llama 3.3 70B, fast fp8.
	TextModelLlama33_70B TextModel = "@cf/meta/llama-3.3-70b-instruct-fp8-fast"
	// TextModelQwen3_30B — Qwen3 30B MoE, function calling.
	TextModelQwen3_30B TextModel = "@cf/qwen/qwen3-30b-a3b-fp8"
	// TextModelMistralSmall — Mistral Small 3.1 24B, 128k context.
	TextModelMistralSmall TextModel = "@cf/mistralai/mistral-small-3.1-24b-instruct"
	// TextModelQwQ32B — Qwen QwQ 32B reasoning model.
	TextModelQwQ32B TextModel = "@cf/qwen/qwq-32b"
	// TextModelDeepSeekR1 — DeepSeek R1 distill Qwen 32B.
	TextModelDeepSeekR1 TextModel = "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"
	// TextModelGemma3_12B — Google Gemma 3 12B, 80k context.
	TextModelGemma3_12B TextModel = "@cf/google/gemma-3-12b-it"
	// TextModelGPTOSS120B — OpenAI gpt-oss-120b, high reasoning.
	TextModelGPTOSS120B TextModel = "@cf/openai/gpt-oss-120b"
	// TextModelGPTOSS20B — OpenAI gpt-oss-20b, lower latency.
	TextModelGPTOSS20B TextModel = "@cf/openai/gpt-oss-20b"
	// TextModelNemotron3 — NVIDIA Nemotron 3 120B, agentic.
	TextModelNemotron3 TextModel = "@cf/nvidia/nemotron-3-120b-a12b"
	// TextModelLlama32_3B — Meta Llama 3.2 3B, lightweight.
	TextModelLlama32_3B TextModel = "@cf/meta/llama-3.2-3b-instruct"

	// DefaultTextModel is the default model used when none is specified.
	DefaultTextModel = TextModelLlama4Scout
)
|
|
|
|
// TextModelInfo describes a single text generation model.
|
|
type TextModelInfo struct {
|
|
ID string `json:"id"`
|
|
Label string `json:"label"`
|
|
Provider string `json:"provider"`
|
|
ContextSize int `json:"context_size"` // max context in tokens
|
|
Description string `json:"description"`
|
|
}
|
|
|
|
// AllTextModels returns metadata about every supported text generation model.
|
|
func AllTextModels() []TextModelInfo {
|
|
return []TextModelInfo{
|
|
{
|
|
ID: string(TextModelGemma4), Label: "Gemma 4 26B", Provider: "Google",
|
|
ContextSize: 256000,
|
|
Description: "Google's most intelligent open model family. 256k context, function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama4Scout), Label: "Llama 4 Scout 17B", Provider: "Meta",
|
|
ContextSize: 131000,
|
|
Description: "Natively multimodal, 16 experts. Good all-purpose model with function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama33_70B), Label: "Llama 3.3 70B (fp8 fast)", Provider: "Meta",
|
|
ContextSize: 24000,
|
|
Description: "Llama 3.3 70B quantized to fp8 for speed. Excellent instruction following.",
|
|
},
|
|
{
|
|
ID: string(TextModelQwen3_30B), Label: "Qwen3 30B MoE", Provider: "Qwen",
|
|
ContextSize: 32768,
|
|
Description: "MoE architecture with strong reasoning and instruction following.",
|
|
},
|
|
{
|
|
ID: string(TextModelMistralSmall), Label: "Mistral Small 3.1 24B", Provider: "MistralAI",
|
|
ContextSize: 128000,
|
|
Description: "Strong text performance with 128k context and function calling.",
|
|
},
|
|
{
|
|
ID: string(TextModelQwQ32B), Label: "QwQ 32B (reasoning)", Provider: "Qwen",
|
|
ContextSize: 24000,
|
|
Description: "Reasoning model — thinks before answering. Slower but more accurate.",
|
|
},
|
|
{
|
|
ID: string(TextModelDeepSeekR1), Label: "DeepSeek R1 32B", Provider: "DeepSeek",
|
|
ContextSize: 80000,
|
|
Description: "R1-distilled reasoning model. Outperforms o1-mini on many benchmarks.",
|
|
},
|
|
{
|
|
ID: string(TextModelGemma3_12B), Label: "Gemma 3 12B", Provider: "Google",
|
|
ContextSize: 80000,
|
|
Description: "Multimodal, 128k context, multilingual (140+ languages).",
|
|
},
|
|
{
|
|
ID: string(TextModelGPTOSS120B), Label: "GPT-OSS 120B", Provider: "OpenAI",
|
|
ContextSize: 128000,
|
|
Description: "OpenAI open-weight model for production, general purpose, high reasoning.",
|
|
},
|
|
{
|
|
ID: string(TextModelGPTOSS20B), Label: "GPT-OSS 20B", Provider: "OpenAI",
|
|
ContextSize: 128000,
|
|
Description: "OpenAI open-weight model for lower latency and specialized use cases.",
|
|
},
|
|
{
|
|
ID: string(TextModelNemotron3), Label: "Nemotron 3 120B", Provider: "NVIDIA",
|
|
ContextSize: 256000,
|
|
Description: "Hybrid MoE with leading accuracy for multi-agent applications.",
|
|
},
|
|
{
|
|
ID: string(TextModelLlama32_3B), Label: "Llama 3.2 3B", Provider: "Meta",
|
|
ContextSize: 80000,
|
|
Description: "Lightweight model for simple tasks. Fast and cheap.",
|
|
},
|
|
}
|
|
}
|
|
|
|
// TextMessage is a single message in a chat conversation.
|
|
type TextMessage struct {
|
|
Role string `json:"role"` // "system" or "user"
|
|
Content string `json:"content"` // message text
|
|
}
|
|
|
|
// TextRequest is the input to Generate.
|
|
type TextRequest struct {
|
|
// Model is the CF Workers AI model ID. Defaults to DefaultTextModel when empty.
|
|
Model TextModel
|
|
// Messages is the conversation history (system + user messages).
|
|
Messages []TextMessage
|
|
// MaxTokens limits the output length (0 = model default).
|
|
MaxTokens int
|
|
}
|
|
|
|
// TextGenClient generates text via Cloudflare Workers AI LLM models.
|
|
type TextGenClient interface {
|
|
// Generate sends a chat-style request and returns the model's response text.
|
|
Generate(ctx context.Context, req TextRequest) (string, error)
|
|
|
|
// Models returns metadata about all supported text generation models.
|
|
Models() []TextModelInfo
|
|
}
|
|
|
|
// textGenHTTPClient is the concrete CF AI text generation client.
|
|
type textGenHTTPClient struct {
|
|
accountID string
|
|
apiToken string
|
|
http *http.Client
|
|
}
|
|
|
|
// NewTextGen returns a TextGenClient for the given Cloudflare account.
|
|
func NewTextGen(accountID, apiToken string) TextGenClient {
|
|
return &textGenHTTPClient{
|
|
accountID: accountID,
|
|
apiToken: apiToken,
|
|
http: &http.Client{Timeout: 5 * time.Minute},
|
|
}
|
|
}
|
|
|
|
// Generate sends messages to the model and returns the response text.
|
|
func (c *textGenHTTPClient) Generate(ctx context.Context, req TextRequest) (string, error) {
|
|
if req.Model == "" {
|
|
req.Model = DefaultTextModel
|
|
}
|
|
|
|
body := map[string]any{
|
|
"messages": req.Messages,
|
|
}
|
|
if req.MaxTokens > 0 {
|
|
body["max_tokens"] = req.MaxTokens
|
|
}
|
|
|
|
encoded, err := json.Marshal(body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: marshal: %w", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("https://api.cloudflare.com/client/v4/accounts/%s/ai/run/%s",
|
|
c.accountID, string(req.Model))
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(encoded))
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Authorization", "Bearer "+c.apiToken)
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cfai/text: http: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
errBody, _ := io.ReadAll(resp.Body)
|
|
msg := string(errBody)
|
|
if len(msg) > 300 {
|
|
msg = msg[:300]
|
|
}
|
|
return "", fmt.Errorf("cfai/text: model %s returned %d: %s", req.Model, resp.StatusCode, msg)
|
|
}
|
|
|
|
// CF AI wraps responses: { "result": { "response": "..." }, "success": true }
|
|
// Some models (e.g. Llama 4 Scout) return response as an array:
|
|
// { "result": { "response": [{"generated_text":"..."}] } }
|
|
var wrapper struct {
|
|
Result struct {
|
|
Response json.RawMessage `json:"response"`
|
|
} `json:"result"`
|
|
Success bool `json:"success"`
|
|
Errors []string `json:"errors"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&wrapper); err != nil {
|
|
return "", fmt.Errorf("cfai/text: decode response: %w", err)
|
|
}
|
|
if !wrapper.Success {
|
|
return "", fmt.Errorf("cfai/text: model %s error: %v", req.Model, wrapper.Errors)
|
|
}
|
|
// Try plain string first.
|
|
var text string
|
|
if err := json.Unmarshal(wrapper.Result.Response, &text); err == nil {
|
|
return text, nil
|
|
}
|
|
// Fall back: array of objects with a "generated_text" field.
|
|
var arr []struct {
|
|
GeneratedText string `json:"generated_text"`
|
|
}
|
|
if err := json.Unmarshal(wrapper.Result.Response, &arr); err == nil && len(arr) > 0 {
|
|
return arr[0].GeneratedText, nil
|
|
}
|
|
return "", fmt.Errorf("cfai/text: model %s: unrecognised response shape: %s", req.Model, wrapper.Result.Response)
|
|
}
|
|
|
|
// Models returns all supported text generation model metadata.
|
|
func (c *textGenHTTPClient) Models() []TextModelInfo {
|
|
return AllTextModels()
|
|
}
|