supertonic.core¶

supertonic.core ¶

Core TTS engine and text processing components.

This module contains the main Supertonic TTS engine, text processor, and supporting utilities for audio synthesis.

Classes:

Name	Description
`UnicodeProcessor`	Processes text into unicode indices for the TTS model.
`Style`	Voice style representation for TTS synthesis.
`Supertonic`	Core TTS engine for Supertonic speech synthesis.

Functions:

Name	Description
`length_to_mask`	Convert lengths to binary mask.
`get_latent_mask`	Generate mask for latent representations.

Attributes:

Name	Type	Description
`logger`

logger `module-attribute` ¶

logger = getLogger(__name__)

length_to_mask ¶

length_to_mask(
    lengths: ndarray, max_len: Optional[int] = None
) -> ndarray

Convert lengths to binary mask.

Parameters:

Name	Type	Description	Default
`lengths`	`ndarray`	Sequence lengths, shape (B,)	required
`max_len`	`Optional[int]`	Width of the mask; the largest value in `lengths` is used when omitted	`None`

Returns:

Name	Type	Description
`mask`	`ndarray`	(B, 1, max_len)

Source code in supertonic/core.py

def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
    """
    Convert lengths to binary mask.

    Position ``t`` of row ``b`` is 1.0 when ``t < lengths[b]`` and 0.0 otherwise.

    Args:
        lengths: (B,) sequence lengths.
        max_len: Width of the mask. When ``None`` the largest entry of
            ``lengths`` is used (0 for an empty ``lengths``). An explicit
            ``0`` is honoured and produces an empty last axis.

    Returns:
        mask: (B, 1, max_len) float32 array.
    """
    # Test against None explicitly: ``max_len or ...`` would silently replace a
    # caller-supplied width of 0 with the longest length.
    if max_len is None:
        # ``max()`` of an empty array raises, so an empty batch gets width 0.
        max_len = lengths.max() if lengths.size else 0
    ids = np.arange(0, max_len)
    mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
    if max_len == 0:
        # reshape(-1, 1, 0) cannot infer the batch axis of a zero-size array,
        # so the batch size is spelled out for the zero-width case.
        return mask.reshape(len(lengths), 1, 0)
    return mask.reshape(-1, 1, max_len)

get_latent_mask ¶

get_latent_mask(
    wav_lengths: ndarray,
    base_chunk_size: int,
    chunk_compress_factor: int,
) -> ndarray

Generate mask for latent representations.

Source code in supertonic/core.py

def get_latent_mask(
    wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
) -> np.ndarray:
    """Build the binary mask over latent frames for a batch of waveform lengths.

    One latent frame covers ``base_chunk_size * chunk_compress_factor``
    waveform samples. Each waveform length is converted to a frame count by
    rounding up, and the counts are turned into a mask by ``length_to_mask``.

    Args:
        wav_lengths: Waveform lengths in samples, one entry per batch item.
        base_chunk_size: Base chunk size of the latent representation.
        chunk_compress_factor: Compression factor applied on top of the base chunk.

    Returns:
        The mask produced by ``length_to_mask`` for the per-item frame counts.
    """
    samples_per_frame = base_chunk_size * chunk_compress_factor
    # Ceiling division carried out with integer arithmetic only.
    frame_counts = (wav_lengths + samples_per_frame - 1) // samples_per_frame
    return length_to_mask(frame_counts)

UnicodeProcessor ¶

UnicodeProcessor(unicode_indexer_path: str)

Processes text into unicode indices for the TTS model.

This class handles text preprocessing, normalization, and conversion to numeric indices that the TTS model can understand.

Parameters:

Name	Type	Description	Default
`unicode_indexer_path`	`str`	Path to the unicode indexer JSON file	required

Methods:

Name	Description
`validate_text`	Validate if text can be processed by the model.
`validate_text_list`	Validate a list of texts.

Attributes:

Name	Type	Description
`indexer`
`supported_chars`
`supported_character_set`	`set[str]`

Source code in supertonic/core.py

def __init__(self, unicode_indexer_path: str):
    """Set up the processor from a unicode indexer file.

    Args:
        unicode_indexer_path: Path to the unicode indexer JSON file
    """
    loaded_indexer = self._load_indexer(unicode_indexer_path)
    self.indexer = loaded_indexer
    # Called only after ``self.indexer`` is assigned; presumably the helper
    # reads it -- keep this ordering.
    supported = self._make_supported_characters()
    self.supported_chars = supported

indexer `instance-attribute` ¶

indexer = _load_indexer(unicode_indexer_path)

supported_chars `instance-attribute` ¶

supported_chars = _make_supported_characters()

supported_character_set `property` ¶

supported_character_set: set[str]

validate_text ¶

validate_text(text: str) -> tuple[bool, list[str]]

Validate if text can be processed by the model.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to validate	required

Returns:

Type	Description
`tuple[bool, list[str]]`	Tuple of (is_valid, unsupported_chars): - is_valid: True if text can be processed - unsupported_chars: List of unsupported characters (empty if valid)

Example

processor = UnicodeProcessor("unicode_indexer.json")
is_valid, unsupported = processor.validate_text("Hello world")
if not is_valid:
    print(f"Cannot process: {unsupported}")

Source code in supertonic/core.py

def validate_text(self, text: str) -> tuple[bool, list[str]]:
    """Check whether every character of ``text`` is supported by the model.

    Each distinct input character is preprocessed on its own; the character
    is reported when its preprocessed form contains anything outside
    ``supported_character_set``.

    Args:
        text: Text to validate

    Returns:
        Tuple of (is_valid, unsupported_chars):
            - is_valid: True if no unsupported character was found
            - unsupported_chars: Sorted list of the offending input
              characters (empty if valid)

    Example:
        ```python
        processor = UnicodeProcessor("unicode_indexer.json")
        ok, bad_chars = processor.validate_text("Hello world")
        if not ok:
            print(f"Cannot process: {bad_chars}")
        ```
    """
    rejected = {
        char
        for char in set(text)
        if set(self._preprocess_text(char)) - self.supported_character_set
    }
    return not rejected, sorted(rejected)

validate_text_list ¶

validate_text_list(
    text_list: list[str],
) -> tuple[bool, list[str]]

Validate a list of texts.

Source code in supertonic/core.py

def validate_text_list(self, text_list: list[str]) -> tuple[bool, list[str]]:
    """Validate several texts at once.

    The texts are concatenated and checked in a single ``validate_text``
    call, so the result describes the list as a whole.

    Args:
        text_list: Texts to validate

    Returns:
        The ``(is_valid, unsupported_chars)`` tuple returned by
        ``validate_text`` for the concatenated text.
    """
    return self.validate_text("".join(text_list))

Style ¶

Style(style_ttl_onnx: ndarray, style_dp_onnx: ndarray)

Voice style representation for TTS synthesis.

This class encapsulates the style vectors used to control the voice characteristics during speech synthesis.

Parameters:

Name	Type	Description	Default
`style_ttl_onnx`	`ndarray`	Style vector for the text-to-latent model	required
`style_dp_onnx`	`ndarray`	Style vector for the duration predictor	required

Attributes:

Name	Type	Description
`ttl`	`ndarray`	Text-to-latent style vector
`dp`	`ndarray`	Duration predictor style vector

Source code in supertonic/core.py

def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
    """Store the two style vectors after checking that both are numpy arrays.

    Args:
        style_ttl_onnx: Style vector for the text-to-latent model
        style_dp_onnx: Style vector for the duration predictor

    Raises:
        TypeError: If either argument is not a ``numpy.ndarray``. The
            text-to-latent vector is checked first.
    """
    for label, vector in (("style_ttl", style_ttl_onnx), ("style_dp", style_dp_onnx)):
        if isinstance(vector, np.ndarray):
            continue
        raise TypeError(f"{label} must be numpy array, got {type(vector).__name__}")

    self.ttl, self.dp = style_ttl_onnx, style_dp_onnx

ttl `instance-attribute` ¶

ttl = style_ttl_onnx

dp `instance-attribute` ¶

dp = style_dp_onnx

Supertonic ¶

Supertonic(
    cfgs: dict,
    text_processor: UnicodeProcessor,
    dp_ort: InferenceSession,
    text_enc_ort: InferenceSession,
    vector_est_ort: InferenceSession,
    vocoder_ort: InferenceSession,
)

Core TTS engine for Supertonic speech synthesis.

This class orchestrates the entire text-to-speech pipeline, from text encoding through duration prediction and waveform generation.

Parameters:

Name	Type	Description	Default
`cfgs`	`dict`	Model configuration dictionary	required
`text_processor`	`UnicodeProcessor`	Unicode text processor instance	required
`dp_ort`	`InferenceSession`	Duration predictor ONNX session	required
`text_enc_ort`	`InferenceSession`	Text encoder ONNX session	required
`vector_est_ort`	`InferenceSession`	Vector estimator ONNX session	required
`vocoder_ort`	`InferenceSession`	Vocoder ONNX session	required

Attributes:

Name	Type	Description
`sample_rate`	`int`	Audio sample rate in Hz
`base_chunk_size`	`int`	Base chunk size for latent representation
`chunk_compress_factor`	`int`	Compression factor for chunks
`ldim`	`int`	Latent dimension size

Methods:

Name	Description
`sample_noisy_latent`	Sample masked Gaussian noise latents and the matching latent mask for the given durations.

Source code in supertonic/core.py

def __init__(
    self,
    cfgs: dict,
    text_processor: UnicodeProcessor,
    dp_ort: ort.InferenceSession,
    text_enc_ort: ort.InferenceSession,
    vector_est_ort: ort.InferenceSession,
    vocoder_ort: ort.InferenceSession,
):
    """Wire up the engine from its configuration, text processor and ONNX sessions.

    Args:
        cfgs: Model configuration dictionary; must provide
            ``["ae"]["sample_rate"]``, ``["ae"]["base_chunk_size"]``,
            ``["ttl"]["chunk_compress_factor"]`` and ``["ttl"]["latent_dim"]``.
        text_processor: Unicode text processor instance
        dp_ort: Duration predictor ONNX session
        text_enc_ort: Text encoder ONNX session
        vector_est_ort: Vector estimator ONNX session
        vocoder_ort: Vocoder ONNX session

    Raises:
        TypeError: If ``text_processor`` is not a ``UnicodeProcessor`` or any
            session is not an ``ort.InferenceSession``.
        ValueError: If a required configuration key is missing.
    """
    # Reject wrong argument types before touching any state.
    if not isinstance(text_processor, UnicodeProcessor):
        raise TypeError(
            f"text_processor must be UnicodeProcessor, got {type(text_processor).__name__}"
        )

    # Insertion order is the order in which the sessions are checked.
    ort_sessions = {
        "dp_ort": dp_ort,
        "text_enc_ort": text_enc_ort,
        "vector_est_ort": vector_est_ort,
        "vocoder_ort": vocoder_ort,
    }
    for name, session in ort_sessions.items():
        if not isinstance(session, ort.InferenceSession):
            raise TypeError(f"{name} must be InferenceSession, got {type(session).__name__}")

    self.cfgs = cfgs
    self.text_processor = text_processor
    self.dp_ort = dp_ort
    self.text_enc_ort = text_enc_ort
    self.vector_est_ort = vector_est_ort
    self.vocoder_ort = vocoder_ort

    try:
        self.sample_rate = cfgs["ae"]["sample_rate"]
        self.base_chunk_size = cfgs["ae"]["base_chunk_size"]
        self.chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]
        self.ldim = cfgs["ttl"]["latent_dim"]
    except KeyError as e:
        # Log the missing key, then surface it as a ValueError with guidance.
        logger.error(f"Missing required config key: {e}")
        raise ValueError(
            f"Model configuration is incomplete. Missing key: {e}. "
            f"Please ensure you have downloaded the correct model files."
        ) from e
    else:
        logger.info(
            f"Initialized Supertonic engine (sample_rate={self.sample_rate}Hz, "
            f"latent_dim={self.ldim})"
        )

cfgs `instance-attribute` ¶

cfgs = cfgs

text_processor `instance-attribute` ¶

text_processor = text_processor

dp_ort `instance-attribute` ¶

dp_ort = dp_ort

text_enc_ort `instance-attribute` ¶

text_enc_ort = text_enc_ort

vector_est_ort `instance-attribute` ¶

vector_est_ort = vector_est_ort

vocoder_ort `instance-attribute` ¶

vocoder_ort = vocoder_ort

sample_rate `instance-attribute` ¶

sample_rate = cfgs['ae']['sample_rate']

base_chunk_size `instance-attribute` ¶

base_chunk_size = cfgs['ae']['base_chunk_size']

chunk_compress_factor `instance-attribute` ¶

chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]

ldim `instance-attribute` ¶

ldim = cfgs['ttl']['latent_dim']

sample_noisy_latent ¶

sample_noisy_latent(
    duration: ndarray,
) -> tuple[ndarray, ndarray]

Source code in supertonic/core.py

def sample_noisy_latent(self, duration: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Draw masked Gaussian noise latents for a batch of durations.

    Args:
        duration: Per-item durations. They are multiplied by
            ``self.sample_rate`` to obtain lengths in samples, so presumably
            they are in seconds -- confirm with the caller.

    Returns:
        Tuple of (noisy_latent, latent_mask):
            - noisy_latent: float32 standard-normal noise of shape
              (batch, ldim * chunk_compress_factor, latent_len), multiplied
              by the mask
            - latent_mask: the mask returned by ``get_latent_mask`` for the
              per-item sample counts
    """
    samples_per_latent = self.base_chunk_size * self.chunk_compress_factor
    channels = self.ldim * self.chunk_compress_factor
    batch = len(duration)

    # The time axis is sized from the longest item, rounded up to whole frames.
    longest_wav = duration.max() * self.sample_rate
    n_frames = ((longest_wav + samples_per_latent - 1) / samples_per_latent).astype(np.int32)

    sample_counts = (duration * self.sample_rate).astype(np.int64)
    noise = np.random.randn(batch, channels, n_frames).astype(np.float32)
    mask = get_latent_mask(sample_counts, self.base_chunk_size, self.chunk_compress_factor)
    return noise * mask, mask

supertonic.core¶

supertonic.core ¶

logger module-attribute ¶

length_to_mask ¶

get_latent_mask ¶

UnicodeProcessor ¶

indexer instance-attribute ¶

supported_chars instance-attribute ¶

supported_character_set property ¶

validate_text ¶

validate_text_list ¶

Style ¶

ttl instance-attribute ¶

dp instance-attribute ¶

Supertonic ¶

cfgs instance-attribute ¶

text_processor instance-attribute ¶

dp_ort instance-attribute ¶

text_enc_ort instance-attribute ¶

vector_est_ort instance-attribute ¶

vocoder_ort instance-attribute ¶

sample_rate instance-attribute ¶

base_chunk_size instance-attribute ¶

chunk_compress_factor instance-attribute ¶

ldim instance-attribute ¶

sample_noisy_latent ¶

logger `module-attribute` ¶

indexer `instance-attribute` ¶

supported_chars `instance-attribute` ¶

supported_character_set `property` ¶

ttl `instance-attribute` ¶

dp `instance-attribute` ¶

cfgs `instance-attribute` ¶

text_processor `instance-attribute` ¶

dp_ort `instance-attribute` ¶

text_enc_ort `instance-attribute` ¶

vector_est_ort `instance-attribute` ¶

vocoder_ort `instance-attribute` ¶

sample_rate `instance-attribute` ¶

base_chunk_size `instance-attribute` ¶

chunk_compress_factor `instance-attribute` ¶

ldim `instance-attribute` ¶