// Package audiocap synthesizes speech with Google Cloud Text-to-Speech and
// plays the result into a PulseAudio sink using the paplay command.
package audiocap

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"os/exec"

	texttospeech "cloud.google.com/go/texttospeech/apiv1"
	"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
)

// Client wraps a Text-to-Speech client together with the sink that
// synthesized audio is played into.
type Client struct {
	client *texttospeech.Client
	sink   string // pulse sink to play into
}

// New creates a Client that plays synthesized audio into sink.
// The caller should call Close when the Client is no longer needed.
func New(ctx context.Context, sink string) (*Client, error) {
	c, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("tts client: %w", err)
	}
	return &Client{client: c, sink: sink}, nil
}

// Close closes the underlying Text-to-Speech client.
func (c *Client) Close() error {
	return c.client.Close()
}

// Speak synthesizes text and plays it into the configured sink.
//
// It blocks until paplay has exited. Cancelling ctx aborts the synthesis
// request and kills a running paplay process. A non-nil error wraps the
// underlying cause, so errors.Is and errors.As still work on it.
func (c *Client) Speak(ctx context.Context, text string) error {
	req := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
		},
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: "en-US",
			Name:         "en-US-Neural2-D",
		},
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
			SampleRateHertz: 24000,
		},
	}

	resp, err := c.client.SynthesizeSpeech(ctx, req)
	if err != nil {
		return fmt.Errorf("synthesize: %w", err)
	}
	audio := resp.GetAudioContent()
	if len(audio) == 0 {
		// Nothing to play; fail here rather than let paplay choke on an
		// empty stream.
		return errors.New("synthesize: empty audio content")
	}

	// LINEAR16 from Google TTS is raw PCM wrapped in a WAV header.
	// paplay handles WAV directly.
	cmd := exec.CommandContext(ctx, "paplay", "--device="+c.sink)

	// Let os/exec feed stdin and reap the child. Run always waits for the
	// process, so an early paplay exit can no longer leave a zombie or an
	// open pipe behind, and the exit status is reported instead of a
	// broken-pipe write error.
	cmd.Stdin = bytes.NewReader(audio)
	var stderr bytes.Buffer
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		if msg := bytes.TrimSpace(stderr.Bytes()); len(msg) > 0 {
			return fmt.Errorf("paplay: %w: %s", err, msg)
		}
		return fmt.Errorf("paplay: %w", err)
	}
	return nil
}