Skip to content

supertonic.core

supertonic.core

Core TTS engine and text processing components.

This module contains the main Supertonic TTS engine, text processor, and supporting utilities for audio synthesis.

Classes:

Name Description
UnicodeProcessor

Processes text into unicode indices for the TTS model.

Style

Voice style representation for TTS synthesis.

Supertonic

Core TTS engine for Supertonic speech synthesis.

Functions:

Name Description
length_to_mask

Convert lengths to binary mask.

get_latent_mask

Generate mask for latent representations.

Attributes:

Name Type Description
logger

logger module-attribute

logger = getLogger(__name__)

length_to_mask

length_to_mask(
    lengths: ndarray, max_len: Optional[int] = None
) -> ndarray

Convert lengths to binary mask.

Parameters:

Name Type Description Default
lengths ndarray

Sequence lengths, one entry per row of the mask; shape (B,).

required
max_len Optional[int]

Width of the mask (size of its last axis). When not given, the largest value in lengths is used.

None

Returns:

Name Type Description
mask ndarray

Float32 array of shape (B, 1, max_len); entry [b, 0, t] is 1 where t < lengths[b] and 0 elsewhere.

Source code in supertonic/core.py
def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
    """
    Convert lengths to binary mask.

    Position ``t`` of row ``b`` is 1.0 when ``t < lengths[b]`` and 0.0
    otherwise.

    Args:
        lengths: (B,) sequence lengths.
        max_len: Width of the mask. When ``None`` the largest entry of
            ``lengths`` is used (0 when ``lengths`` is empty). An explicit
            ``0`` is honoured and yields a zero-width mask.

    Returns:
        mask: (B, 1, max_len) float32 array.
    """
    lengths = np.asarray(lengths)
    if max_len is None:
        # Test for None explicitly: `max_len or ...` would silently discard
        # a caller-supplied 0. An empty batch has no maximum, so use width 0.
        max_len = lengths.max() if lengths.size else 0
    ids = np.arange(0, max_len)
    mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
    # Name the batch axis explicitly; reshape(-1, ...) cannot infer it when
    # the mask has zero width.
    return mask.reshape(lengths.shape[0], 1, max_len)

get_latent_mask

get_latent_mask(
    wav_lengths: ndarray,
    base_chunk_size: int,
    chunk_compress_factor: int,
) -> ndarray

Generate mask for latent representations.

Source code in supertonic/core.py
def get_latent_mask(
    wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
) -> np.ndarray:
    """Build the mask over latent frames for a batch of waveform lengths.

    Each latent frame covers ``base_chunk_size * chunk_compress_factor``
    waveform samples; a waveform's frame count is its sample count divided by
    that span, rounded up. The per-item frame counts are then turned into a
    mask by ``length_to_mask``.
    """
    samples_per_frame = base_chunk_size * chunk_compress_factor
    # Integer ceiling division: add (span - 1) before flooring.
    frame_counts = (wav_lengths + samples_per_frame - 1) // samples_per_frame
    return length_to_mask(frame_counts)

UnicodeProcessor

UnicodeProcessor(unicode_indexer_path: str)

Processes text into unicode indices for the TTS model.

This class handles text preprocessing, normalization, and conversion to numeric indices that the TTS model can understand.

Parameters:

Name Type Description Default
unicode_indexer_path str

Path to the unicode indexer JSON file

required

Methods:

Name Description
validate_text

Validate if text can be processed by the model.

validate_text_list

Validate a list of texts.

Attributes:

Name Type Description
indexer
supported_chars
supported_character_set set[str]
Source code in supertonic/core.py
def __init__(self, unicode_indexer_path: str) -> None:
    """Load the unicode indexer and build the supported-character data.

    Args:
        unicode_indexer_path: Path to the unicode indexer JSON file
    """
    self.indexer = self._load_indexer(unicode_indexer_path)
    # Runs after self.indexer is assigned; presumably derived from it —
    # confirm in _make_supported_characters before reordering.
    self.supported_chars = self._make_supported_characters()

indexer instance-attribute

indexer = _load_indexer(unicode_indexer_path)

supported_chars instance-attribute

supported_chars = _make_supported_characters()

supported_character_set property

supported_character_set: set[str]

validate_text

validate_text(text: str) -> tuple[bool, list[str]]

Validate if text can be processed by the model.

Parameters:

Name Type Description Default
text str

Text to validate

required

Returns:

Type Description
tuple[bool, list[str]]

Tuple of (is_valid, unsupported_chars): - is_valid: True if text can be processed - unsupported_chars: List of unsupported characters (empty if valid)

Example
processor = UnicodeProcessor("unicode_indexer.json")
is_valid, unsupported = processor.validate_text("Hello world")
if not is_valid:
    print(f"Cannot process: {unsupported}")
Source code in supertonic/core.py
def validate_text(self, text: str) -> tuple[bool, list[str]]:
    """Check whether every character of ``text`` is usable by the model.

    Each distinct input character is preprocessed on its own; it is reported
    as unsupported when its preprocessed form contains anything outside
    ``supported_character_set``. The characters reported are the original
    input characters, not their preprocessed forms.

    Args:
        text: Text to validate

    Returns:
        Tuple of (is_valid, unsupported_chars):
            - is_valid: True if text can be processed
            - unsupported_chars: Sorted list of unsupported characters
              (empty if valid)

    Example:
        ```python
        processor = UnicodeProcessor("unicode_indexer.json")
        is_valid, unsupported = processor.validate_text("Hello world")
        if not is_valid:
            print(f"Cannot process: {unsupported}")
        ```
    """
    rejected = {
        ch
        for ch in set(text)
        # Non-empty difference => preprocessing produced an unknown character.
        if set(self._preprocess_text(ch)) - self.supported_character_set
    }
    return not rejected, sorted(rejected)

validate_text_list

validate_text_list(
    text_list: list[str],
) -> tuple[bool, list[str]]

Validate a list of texts.

Source code in supertonic/core.py
def validate_text_list(self, text_list: list[str]) -> tuple[bool, list[str]]:
    """Validate several texts at once.

    The texts are concatenated and checked with a single ``validate_text``
    call, so the result describes the batch as a whole rather than each item.
    """
    return self.validate_text("".join(text_list))

Style

Style(style_ttl_onnx: ndarray, style_dp_onnx: ndarray)

Voice style representation for TTS synthesis.

This class encapsulates the style vectors used to control the voice characteristics during speech synthesis.

Parameters:

Name Type Description Default
style_ttl_onnx ndarray

Style vector for the text-to-latent model

required
style_dp_onnx ndarray

Style vector for the duration predictor

required

Attributes:

Name Type Description
ttl ndarray

Text-to-latent style vector

dp ndarray

Duration predictor style vector

Source code in supertonic/core.py
def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
    """Store the two style vectors after checking both are numpy arrays.

    Args:
        style_ttl_onnx: Style vector for the text-to-latent model
        style_dp_onnx: Style vector for the duration predictor

    Raises:
        TypeError: If either argument is not a ``numpy.ndarray``. The
            text-to-latent vector is checked first.
    """
    ttl_is_array = isinstance(style_ttl_onnx, np.ndarray)
    if not ttl_is_array:
        raise TypeError(f"style_ttl must be numpy array, got {type(style_ttl_onnx).__name__}")

    dp_is_array = isinstance(style_dp_onnx, np.ndarray)
    if not dp_is_array:
        raise TypeError(f"style_dp must be numpy array, got {type(style_dp_onnx).__name__}")

    # Both inputs passed validation; keep references (no copies are made).
    self.ttl, self.dp = style_ttl_onnx, style_dp_onnx

ttl instance-attribute

ttl = style_ttl_onnx

dp instance-attribute

dp = style_dp_onnx

Supertonic

Supertonic(
    cfgs: dict,
    text_processor: UnicodeProcessor,
    dp_ort: InferenceSession,
    text_enc_ort: InferenceSession,
    vector_est_ort: InferenceSession,
    vocoder_ort: InferenceSession,
)

Core TTS engine for Supertonic speech synthesis.

This class orchestrates the entire text-to-speech pipeline, from text encoding through duration prediction and waveform generation.

Parameters:

Name Type Description Default
cfgs dict

Model configuration dictionary

required
text_processor UnicodeProcessor

Unicode text processor instance

required
dp_ort InferenceSession

Duration predictor ONNX session

required
text_enc_ort InferenceSession

Text encoder ONNX session

required
vector_est_ort InferenceSession

Vector estimator ONNX session

required
vocoder_ort InferenceSession

Vocoder ONNX session

required

Attributes:

Name Type Description
sample_rate int

Audio sample rate in Hz

base_chunk_size int

Base chunk size for latent representation

chunk_compress_factor int

Compression factor for chunks

ldim int

Latent dimension size

Methods:

Name Description
sample_noisy_latent
Source code in supertonic/core.py
def __init__(
    self,
    cfgs: dict,
    text_processor: UnicodeProcessor,
    dp_ort: ort.InferenceSession,
    text_enc_ort: ort.InferenceSession,
    vector_est_ort: ort.InferenceSession,
    vocoder_ort: ort.InferenceSession,
):
    """Validate the components, store them, and read key settings from cfgs.

    Args:
        cfgs: Model configuration dictionary; must provide
            ``cfgs["ae"]["sample_rate"]``, ``cfgs["ae"]["base_chunk_size"]``,
            ``cfgs["ttl"]["chunk_compress_factor"]`` and
            ``cfgs["ttl"]["latent_dim"]``.
        text_processor: Unicode text processor instance
        dp_ort: Duration predictor ONNX session
        text_enc_ort: Text encoder ONNX session
        vector_est_ort: Vector estimator ONNX session
        vocoder_ort: Vocoder ONNX session

    Raises:
        TypeError: If ``text_processor`` is not a ``UnicodeProcessor`` or any
            session is not an ``ort.InferenceSession``.
        ValueError: If a required configuration key is missing (chained from
            the underlying ``KeyError``).
    """
    # Type checks come first so nothing is stored on a bad argument.
    if not isinstance(text_processor, UnicodeProcessor):
        raise TypeError(
            f"text_processor must be UnicodeProcessor, got {type(text_processor).__name__}"
        )

    onnx_sessions = {
        "dp_ort": dp_ort,
        "text_enc_ort": text_enc_ort,
        "vector_est_ort": vector_est_ort,
        "vocoder_ort": vocoder_ort,
    }
    for name, session in onnx_sessions.items():
        if isinstance(session, ort.InferenceSession):
            continue
        raise TypeError(f"{name} must be InferenceSession, got {type(session).__name__}")

    self.cfgs = cfgs
    self.text_processor = text_processor
    self.dp_ort = dp_ort
    self.text_enc_ort = text_enc_ort
    self.vector_est_ort = vector_est_ort
    self.vocoder_ort = vocoder_ort

    try:
        # Lookup order is ae -> ttl, so the first missing key reported
        # follows that order.
        ae_cfg = cfgs["ae"]
        self.sample_rate = ae_cfg["sample_rate"]
        self.base_chunk_size = ae_cfg["base_chunk_size"]
        ttl_cfg = cfgs["ttl"]
        self.chunk_compress_factor = ttl_cfg["chunk_compress_factor"]
        self.ldim = ttl_cfg["latent_dim"]
    except KeyError as e:
        logger.error(f"Missing required config key: {e}")
        raise ValueError(
            f"Model configuration is incomplete. Missing key: {e}. "
            f"Please ensure you have downloaded the correct model files."
        ) from e

    logger.info(
        f"Initialized Supertonic engine (sample_rate={self.sample_rate}Hz, "
        f"latent_dim={self.ldim})"
    )

cfgs instance-attribute

cfgs = cfgs

text_processor instance-attribute

text_processor = text_processor

dp_ort instance-attribute

dp_ort = dp_ort

text_enc_ort instance-attribute

text_enc_ort = text_enc_ort

vector_est_ort instance-attribute

vector_est_ort = vector_est_ort

vocoder_ort instance-attribute

vocoder_ort = vocoder_ort

sample_rate instance-attribute

sample_rate = cfgs['ae']['sample_rate']

base_chunk_size instance-attribute

base_chunk_size = cfgs['ae']['base_chunk_size']

chunk_compress_factor instance-attribute

chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]

ldim instance-attribute

ldim = cfgs['ttl']['latent_dim']

sample_noisy_latent

sample_noisy_latent(
    duration: ndarray,
) -> tuple[ndarray, ndarray]
Source code in supertonic/core.py
def sample_noisy_latent(self, duration: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Draw masked Gaussian noise shaped like the latent sequence.

    Args:
        duration: Per-item durations. They are multiplied by ``sample_rate``
            to obtain sample counts, so presumably expressed in seconds —
            confirm against the caller.

    Returns:
        Tuple of (noisy_latent, latent_mask):
            - noisy_latent: float32 array of shape
              (len(duration), ldim * chunk_compress_factor, latent_len),
              standard-normal noise multiplied by the mask
            - latent_mask: the mask returned by ``get_latent_mask`` for the
              per-item waveform lengths
    """
    bsz = len(duration)
    wav_lengths = (duration * self.sample_rate).astype(np.int64)
    latent_mask = get_latent_mask(wav_lengths, self.base_chunk_size, self.chunk_compress_factor)
    # Take the time-axis length from the mask rather than recomputing it with
    # float division: the mask uses integer ceil-division, and a separately
    # rounded float result could differ by one frame and break the
    # element-wise product below.
    latent_len = latent_mask.shape[-1]
    latent_dim = self.ldim * self.chunk_compress_factor
    noisy_latent = np.random.randn(bsz, latent_dim, latent_len).astype(np.float32)
    # Zero the noise in padded frames beyond each item's own length.
    noisy_latent = noisy_latent * latent_mask
    return noisy_latent, latent_mask