2026-04-09 10:21:20 +02:00

68 lines
1.6 KiB
Go

package audiocap
import (
"context"
"fmt"
"os/exec"
texttospeech "cloud.google.com/go/texttospeech/apiv1"
"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
)
// Client pairs a Cloud Text-to-Speech client with the name of the
// PulseAudio sink that synthesized audio is played into.
type Client struct {
	// client is the underlying Text-to-Speech API client.
	client *texttospeech.Client
	// sink is the pulse sink to play into; Speak passes it to paplay
	// as the --device value.
	sink string
}
// New builds a Client that plays synthesized speech into sink.
//
// It creates the underlying Text-to-Speech client with ctx and returns a
// wrapped error if that fails. Call Close on the returned Client to release
// the underlying client.
func New(ctx context.Context, sink string) (*Client, error) {
	tts, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("tts client: %w", err)
	}

	speaker := &Client{
		client: tts,
		sink:   sink,
	}
	return speaker, nil
}
// Close closes the underlying Text-to-Speech client and returns whatever
// error that close reports.
func (c *Client) Close() error {
	err := c.client.Close()
	return err
}
// Speak synthesizes text with the Text-to-Speech API and plays the result
// into the configured sink by piping it to a paplay process.
//
// The voice (en-US-Neural2-D) and output format (LINEAR16 at 24 kHz) are
// fixed. Speak blocks until paplay exits; the process is started with
// exec.CommandContext, so cancelling ctx kills it.
//
// It returns an error if synthesis fails, if paplay cannot be started, if the
// audio cannot be written to paplay's stdin, or if paplay exits
// unsuccessfully. The error from a failed paplay run is returned unwrapped.
func (c *Client) Speak(ctx context.Context, text string) error {
	req := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
		},
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: "en-US",
			Name:         "en-US-Neural2-D",
		},
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
			SampleRateHertz: 24000,
		},
	}
	resp, err := c.client.SynthesizeSpeech(ctx, req)
	if err != nil {
		return fmt.Errorf("synthesize: %w", err)
	}

	// LINEAR16 from Google TTS is raw PCM wrapped in a WAV header.
	// paplay handles WAV directly.
	cmd := exec.CommandContext(ctx, "paplay", "--device="+c.sink)
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("stdin: %w", err)
	}
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("paplay: %w", err)
	}

	// The child is running from here on, so every path must close stdin
	// (so paplay sees EOF) and call Wait (so the process is reaped and its
	// resources are released), even when the write fails.
	_, writeErr := stdin.Write(resp.AudioContent)
	// The close error is deliberately ignored: a failed write is reported
	// below, and otherwise paplay's exit status decides success.
	_ = stdin.Close()
	waitErr := cmd.Wait()

	switch {
	case writeErr != nil && waitErr != nil:
		// A broken pipe usually means paplay exited early; include its
		// status so the real cause is visible.
		return fmt.Errorf("write: %w (paplay: %v)", writeErr, waitErr)
	case writeErr != nil:
		return fmt.Errorf("write: %w", writeErr)
	default:
		return waitErr
	}
}