// responder/pkg/audiocap/tts.go

package audiocap

import (
	"bytes"
	"context"
	"fmt"
	"os/exec"

	texttospeech "cloud.google.com/go/texttospeech/apiv1"
	"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
)

// Client synthesizes speech through the Google Cloud Text-to-Speech API and
// plays the result into a pulse sink. Create one with New and release it with
// Close.
type Client struct {
	// client is the underlying Text-to-Speech API client; Close closes it.
	client *texttospeech.Client
	// sink is the pulse sink name handed to paplay as --device when playing.
	sink   string // pulse sink to play into
}

// New creates a Text-to-Speech API client using ctx and returns a Client that
// plays synthesized audio into the given pulse sink.
//
// The error from creating the API client is returned wrapped with the
// "tts client" prefix; in that case the returned *Client is nil.
func New(ctx context.Context, sink string) (*Client, error) {
	ttsClient, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("tts client: %w", err)
	}

	speaker := &Client{
		client: ttsClient,
		sink:   sink,
	}
	return speaker, nil
}

// Close closes the underlying Text-to-Speech API client and reports the error
// it returns, if any.
func (c *Client) Close() error {
	err := c.client.Close()
	return err
}

// Speak synthesizes text and plays it into the configured sink.
//
// The request uses the en-US-Neural2-D voice and asks for LINEAR16 audio at
// 24 kHz. The audio is piped to a paplay process targeting the configured
// sink, and Speak blocks until that process exits. paplay is started with
// ctx, so cancelling ctx kills playback.
//
// Speak returns an error when synthesis fails, when the service returns no
// audio, or when paplay cannot be started or exits unsuccessfully. In the
// last case anything paplay wrote to stderr is appended to the error. The
// underlying error is wrapped with %w, so errors.Is / errors.As still work.
func (c *Client) Speak(ctx context.Context, text string) error {
	req := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
		},
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: "en-US",
			Name:         "en-US-Neural2-D",
		},
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
			SampleRateHertz: 24000,
		},
	}

	resp, err := c.client.SynthesizeSpeech(ctx, req)
	if err != nil {
		return fmt.Errorf("synthesize: %w", err)
	}
	audio := resp.GetAudioContent()
	if len(audio) == 0 {
		// Feeding paplay an empty stream only yields an opaque exit status;
		// report the real cause instead.
		return fmt.Errorf("synthesize: empty audio content")
	}

	// LINEAR16 from Google TTS is raw PCM wrapped in a WAV header.
	// paplay handles WAV directly.
	cmd := exec.CommandContext(ctx, "paplay",
		"--device="+c.sink,
	)

	// Let os/exec own the stdin pipe: it copies the audio in the background,
	// closes the pipe when done, and Run always reaps the child. Writing to a
	// StdinPipe by hand would leak the pipe and leave the process unreaped if
	// the write failed part-way (e.g. paplay exiting early).
	cmd.Stdin = bytes.NewReader(audio)

	// Keep paplay's diagnostics so a failure is more useful than a bare
	// "exit status 1".
	var stderr bytes.Buffer
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		if msg := bytes.TrimSpace(stderr.Bytes()); len(msg) > 0 {
			return fmt.Errorf("paplay: %w: %s", err, msg)
		}
		return fmt.Errorf("paplay: %w", err)
	}
	return nil
}