def create_parser() -> argparse.ArgumentParser:
    """Create and return the CLI argument parser.

    This function is separated to allow documentation generation tools
    to extract CLI arguments automatically.

    The ``say`` and ``tts`` commands expose the same synthesis options; they
    are registered through one nested helper so the two commands cannot drift
    apart. The deprecated ``synthesize`` command keeps its reduced set of
    flags, but its namespace is populated with the same defaults as ``tts``
    for every synthesis attribute it does not expose, because both commands
    dispatch to the same handler.

    Returns:
        ArgumentParser configured with all Supertonic CLI commands
    """
    parser = argparse.ArgumentParser(
        prog="supertonic",
        description="Supertonic - High-quality Text-to-Speech synthesis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate and play speech directly (no file saved)
supertonic say 'Hello, welcome to the world!'
# Generate speech from text and save to file
supertonic tts 'Hello, welcome to the world!' -o output.wav
# Use different voice and quality
supertonic say 'This is a female voice style.' --voice F1 --steps 10
supertonic tts 'This is a female voice style.' -o hello.wav --voice F1 --steps 10
# Multilingual support (supertonic-3 covers 31 languages — see --lang choices)
supertonic say '안녕하세요! 반갑습니다.' --lang ko
supertonic tts 'Bonjour le monde!' -o french.wav --lang fr
supertonic tts 'Hola, bienvenido!' -o spanish.wav --lang es
# Unknown / unsupported language fallback (supertonic-3)
supertonic say 'Some uncommon text' --lang na
# Use custom voice style from JSON file
supertonic say 'This is a custom voice test.' --custom-style-path ./my_voice.json
# Long text with custom chunking
supertonic tts 'This is a very long text.' -o output.wav --max-chunk-length 200
# List available voices
supertonic list-voices
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Defaults shared by the synthesis commands and by the deprecated
    # ``synthesize`` fallback below, kept in one place so they stay in sync.
    default_speed = 1.05
    default_silence_duration = 0.3

    def add_common_args(p: argparse.ArgumentParser) -> None:
        """Register the flags accepted by every command that calls this helper."""
        p.add_argument(
            "-v",
            "--verbose",
            action="store_true",
            help="Enable verbose output with detailed logging",
        )

    def add_synthesis_args(p: argparse.ArgumentParser) -> None:
        """Register the synthesis options shared by ``say`` and ``tts``.

        Options are added in the same order the two commands originally
        declared them, so the generated ``--help`` output is unchanged.
        """
        p.add_argument(
            "--model",
            type=str,
            default=DEFAULT_MODEL,
            choices=AVAILABLE_MODELS,
            help=(
                "Model to use: supertonic (English only), supertonic-2 (5 languages), "
                f"or supertonic-3 (31 languages + 'na' fallback). Default: {DEFAULT_MODEL}"
            ),
        )
        p.add_argument("--voice", default="M1", help="Voice style (default: M1)")
        p.add_argument(
            "--custom-style-path",
            type=str,
            default=None,
            help="Path to custom voice style JSON file (overrides --voice if provided)",
        )
        p.add_argument(
            "--lang",
            type=str,
            default=None,
            choices=AVAILABLE_LANGUAGES,
            # A fixed metavar keeps the usage line short instead of listing
            # every entry of ``choices``.
            metavar="LANG",
            help=(
                "Language code (supertonic-3): "
                "en, ko, ja, ar, bg, cs, da, de, el, es, et, fi, fr, hi, hr, hu, "
                "id, it, lt, lv, nl, pl, pt, ro, ru, sk, sl, sv, tr, uk, vi, "
                "or 'na' for unknown / unsupported languages. "
                "Default: 'na' for multilingual models (supertonic-2/3), "
                "'en' for supertonic v1."
            ),
        )
        p.add_argument(
            "--steps", type=int, default=8, help="Quality steps (default: 8, higher=better)"
        )
        p.add_argument(
            "--speed",
            type=float,
            default=default_speed,
            help="Speech speed (0.7-2.0, default: 1.05, 2.0=2x faster)",
        )
        p.add_argument(
            "--max-chunk-length",
            type=int,
            default=None,
            help="Maximum characters per chunk (default: auto based on language)",
        )
        p.add_argument(
            "--silence-duration",
            type=float,
            default=default_silence_duration,
            help="Silence between chunks in seconds (default: 0.3)",
        )

    # Say command (play audio directly without saving)
    parser_say = subparsers.add_parser(
        "say", help="Generate speech and play it directly without saving a file"
    )
    parser_say.add_argument("text", help="Text to synthesize and play")
    add_synthesis_args(parser_say)
    add_common_args(parser_say)
    parser_say.set_defaults(func=cmd_say)

    # TTS command
    parser_tts = subparsers.add_parser("tts", aliases=["t"], help="Generate speech from text")
    parser_tts.add_argument("text", help="Text to synthesize")
    parser_tts.add_argument("-o", "--output", required=True, help="Output WAV file")
    add_synthesis_args(parser_tts)
    add_common_args(parser_tts)
    parser_tts.set_defaults(func=cmd_tts)

    # Backward compatibility: synthesize command (deprecated)
    parser_synth = subparsers.add_parser(
        "synthesize", aliases=["s"], help="(Deprecated: use tts) Generate speech from text"
    )
    parser_synth.add_argument("text", help="Text to synthesize")
    parser_synth.add_argument("-o", "--output", required=True, help="Output WAV file")
    parser_synth.add_argument("--voice", default="M1", help="Voice style (default: M1)")
    parser_synth.add_argument(
        "--steps", type=int, default=8, help="Quality steps (default: 8, higher=better)"
    )
    add_common_args(parser_synth)
    # This command shares its handler with ``tts`` but exposes fewer flags.
    # Supply the ``tts`` defaults for the attributes it does not expose so the
    # handler sees the same namespace shape for both commands.
    # NOTE(review): presumably cmd_tts reads these attributes directly —
    # confirm against the handler.
    parser_synth.set_defaults(
        func=cmd_tts,
        model=DEFAULT_MODEL,
        custom_style_path=None,
        lang=None,
        speed=default_speed,
        max_chunk_length=None,
        silence_duration=default_silence_duration,
    )

    # List voices command
    parser_voices = subparsers.add_parser(
        "list-voices", aliases=["lv"], help="List available voice styles"
    )
    parser_voices.set_defaults(func=cmd_list_voices)

    # Info command
    parser_info = subparsers.add_parser("info", aliases=["i"], help="Show model information")
    parser_info.set_defaults(func=cmd_info)

    # Download command
    parser_download = subparsers.add_parser(
        "download", aliases=["d"], help="Download model from HuggingFace"
    )
    parser_download.set_defaults(func=cmd_download)

    # Version command
    parser_version = subparsers.add_parser(
        "version", aliases=["v"], help="Show version information"
    )
    parser_version.set_defaults(func=cmd_version)

    # Serve command — local HTTP wrapper. Installed via ``pip install supertonic[serve]``.
    parser_serve = subparsers.add_parser(
        "serve",
        help="Run a local HTTP server exposing /v1/tts (and OpenAI-compatible /v1/audio/speech)",
    )
    parser_serve.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface to bind (default: 127.0.0.1; loopback only)",
    )
    parser_serve.add_argument(
        "--port", type=int, default=7788, help="Port to listen on (default: 7788)"
    )
    # ``serve`` declares its own --model because its help text differs from
    # the one used by the synthesis commands.
    parser_serve.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        choices=AVAILABLE_MODELS,
        help=f"Model to load on startup (default: {DEFAULT_MODEL})",
    )
    parser_serve.add_argument(
        "--cors",
        type=str,
        default=None,
        help=(
            "Comma-separated CORS origins to allow (e.g. "
            "'http://localhost:*,chrome-extension://*'). "
            "Omit to disable CORS entirely."
        ),
    )
    parser_serve.add_argument(
        "--log-level",
        type=str,
        default="info",
        choices=["critical", "error", "warning", "info", "debug", "trace"],
        help="uvicorn log level (default: info)",
    )
    add_common_args(parser_serve)
    parser_serve.set_defaults(func=cmd_serve)

    return parser