def create_parser() -> argparse.ArgumentParser:
    """Create and return the CLI argument parser.

    This function is separated to allow documentation generation tools
    to extract CLI arguments automatically.

    The ``say``, ``tts`` and deprecated ``synthesize`` sub-commands all
    receive the same synthesis options (model, voice, language, steps,
    speed, chunking) from a single helper, so they cannot drift apart.
    Every sub-command stores its handler in the ``func`` attribute of the
    parsed namespace via ``set_defaults``.

    Returns:
        ArgumentParser configured with all Supertonic CLI commands
    """
    parser = argparse.ArgumentParser(
        prog="supertonic",
        description="Supertonic - High-quality Text-to-Speech synthesis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate and play speech directly (no file saved)
supertonic say 'Hello, welcome to the world!'
# Generate speech from text and save to file
supertonic tts 'Hello, welcome to the world!' -o output.wav
# Use different voice and quality
supertonic say 'This is a female voice style.' --voice F1 --steps 10
supertonic tts 'This is a female voice style.' -o hello.wav --voice F1 --steps 10
# Multilingual support (Korean, Spanish, Portuguese, French)
supertonic say '안녕하세요! 반갑습니다.' --lang ko
supertonic tts 'Bonjour le monde!' -o french.wav --lang fr
supertonic tts 'Hola, bienvenido!' -o spanish.wav --lang es
# Use custom voice style from JSON file
supertonic say 'This is a custom voice test.' --custom-style-path ./my_voice.json
# Long text with custom chunking
supertonic tts 'This is a very long text.' -o output.wav --max-chunk-length 200
# List available voices
supertonic list-voices
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    def add_common_args(p: argparse.ArgumentParser) -> None:
        """Attach the flags shared by every synthesis sub-command."""
        p.add_argument(
            "-v",
            "--verbose",
            action="store_true",
            help="Enable verbose output with detailed logging",
        )

    def add_synthesis_args(p: argparse.ArgumentParser) -> None:
        """Attach the model/voice/language/quality options to ``p``.

        Kept in one place so that ``say``, ``tts`` and ``synthesize`` always
        produce a namespace with the same set of synthesis attributes.
        """
        p.add_argument(
            "--model",
            type=str,
            default=DEFAULT_MODEL,
            choices=AVAILABLE_MODELS,
            help=f"Model to use: supertonic (English only) or supertonic-2 (multilingual). Default: {DEFAULT_MODEL}",
        )
        p.add_argument("--voice", default="M1", help="Voice style (default: M1)")
        p.add_argument(
            "--custom-style-path",
            type=str,
            default=None,
            help="Path to custom voice style JSON file (overrides --voice if provided)",
        )
        p.add_argument(
            "--lang",
            type=str,
            default="en",
            choices=["en", "ko", "es", "pt", "fr"],
            help="Language code: en (English), ko (Korean), es (Spanish), pt (Portuguese), fr (French). Default: en",
        )
        p.add_argument(
            "--steps", type=int, default=5, help="Quality steps (default: 5, higher=better)"
        )
        p.add_argument(
            "--speed",
            type=float,
            default=1.05,
            help="Speech speed (0.7-2.0, default: 1.05, 2.0=2x faster)",
        )
        p.add_argument(
            "--max-chunk-length",
            type=int,
            default=None,
            help="Maximum characters per chunk (default: auto based on language)",
        )
        p.add_argument(
            "--silence-duration",
            type=float,
            default=0.3,
            help="Silence between chunks in seconds (default: 0.3)",
        )

    # Say command (play audio directly without saving)
    parser_say = subparsers.add_parser(
        "say", help="Generate speech and play it directly without saving a file"
    )
    parser_say.add_argument("text", help="Text to synthesize and play")
    add_synthesis_args(parser_say)
    add_common_args(parser_say)
    parser_say.set_defaults(func=cmd_say)

    # TTS command
    parser_tts = subparsers.add_parser("tts", aliases=["t"], help="Generate speech from text")
    parser_tts.add_argument("text", help="Text to synthesize")
    parser_tts.add_argument("-o", "--output", required=True, help="Output WAV file")
    add_synthesis_args(parser_tts)
    add_common_args(parser_tts)
    parser_tts.set_defaults(func=cmd_tts)

    # Backward compatibility: synthesize command (deprecated)
    # It dispatches to the same handler as `tts`, so it must yield a namespace
    # with the same attributes (model, lang, speed, ...). Previously only
    # text/output/voice/steps/verbose were defined here, which presumably made
    # cmd_tts fail on the missing attributes -- verify against cmd_tts.
    parser_synth = subparsers.add_parser(
        "synthesize", aliases=["s"], help="(Deprecated: use tts) Generate speech from text"
    )
    parser_synth.add_argument("text", help="Text to synthesize")
    parser_synth.add_argument("-o", "--output", required=True, help="Output WAV file")
    add_synthesis_args(parser_synth)
    add_common_args(parser_synth)
    parser_synth.set_defaults(func=cmd_tts)

    # List voices command
    parser_voices = subparsers.add_parser(
        "list-voices", aliases=["lv"], help="List available voice styles"
    )
    parser_voices.set_defaults(func=cmd_list_voices)

    # Info command
    parser_info = subparsers.add_parser("info", aliases=["i"], help="Show model information")
    parser_info.set_defaults(func=cmd_info)

    # Download command
    parser_download = subparsers.add_parser(
        "download", aliases=["d"], help="Download model from HuggingFace"
    )
    parser_download.set_defaults(func=cmd_download)

    # Version command
    parser_version = subparsers.add_parser(
        "version", aliases=["v"], help="Show version information"
    )
    parser_version.set_defaults(func=cmd_version)

    return parser