PhoneticArtsFile Config
Version 01

## -----------------------------------------------------------------------------
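## Configuration for building the HMM voice based on MCEP SWOP, AP, MSD
## NOTE(review): hmm_voice_type below is set to 7 (AP + LSP + log(F0)), which
## does not match "MCEP" in this title - confirm which one is intended.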
## -----------------------------------------------------------------------------
#
# Vocal tract
#
#    MCEP: conventional mel-cepstral
#    MCEP SWOP: mel-cepstral from SWOP spectral envelope
#    LSP: Linear Spectral Pairs
#    LSP SWOP: Linear Spectral Pairs from SWOP spectral envelope
#
# Mixed excitation
#
#    VS: voicing strengths
#    AP: aperiodicity
#
# F0
#
#    MSD: Multi Space Distribution
#    GTD: Global Tied Distribution
#

synthesis_method        "hmm" ;
unit_type               "phone" ;
hmm_quinphone_context   false ;

## -----------------------------------------------------------------------------
## HTS-specific:
## -----------------------------------------------------------------------------

#
# Compression:
#
#     0: No compression (HTS)
#     1: Compress PDFs into shorts
#     2: Compress PDFs into bytes
#     3: Memory mapped binary HMM voice
#
hmm_compression 3 ;

##
## DSP-specific:
##

#
# Voice type:
#
#     0: MCEP + log(F0),
#     1: LSP + Mixed Excitation + log(F0)
#     2: MCEP + Mixed Excitation + log(F0)
#     3: STR + MCEP + log(F0)
#     4: STR + LSP + log(F0)
#     5: STR + AP + MCEP + log(F0)
#     6: AP + MCEP + log(F0)
#     7: AP + LSP + log(F0)
#     8: MCEP + Mixed Excitation + log(F0)+GTD
#
hmm_voice_type 7 ;

# sampling rate (Hz)
hmm_sample_rate 22050 ;

# frame period (in sample points; 110 points at 22050 Hz is about 5 ms)
hmm_frame_period 110 ;

# frequency warping factor
hmm_freq_warp 0.46 ;

# Mel-generalized gamma
hmm_mgc_gamma_stage 1.0 ;

##
## HMM:
##

# number of states in an HMM
hmm_num_states 5 ;

# maximal number of EM iterations
hmm_max_em_steps 5 ;

# mixture weight flooring scale
hmm_mixture_weight_floor_scale 3.0 ;

#
# values per-acoustic or mmf stream type:
#
# streams (in list order, see hmm_stream_name): vocal tract (mcp), ME, LF0, DUR
# note: negative value means ``don't care''
#

# minimal state occupancy:
hmm_min_state_occupancy { 10.0 10.0 10.0 5.0 } ;

# MDL stopping criteria for clustering:
hmm_mdl_lambda { 1.0 1.0 1.0 1.0 } ;

# statistics load threshold (R0 command):
hmm_stats_gamma { 0.0 0.0 0.0 0.0 } ;

# stream name
hmm_stream_name { "mcp" "me" "lf0" "dur" } ;

# start index of each HMM stream
hmm_stream_start { 1 2 3 1 } ;

# end index of each HMM stream
hmm_stream_end { 1 2 5 5 } ;

# whether the stream is MSD
hmm_stream_msd { 0 0 1 -1 } ;

# stream weights
hmm_stream_weights { 1.0 1.0 1.0 -1.0 } ;

# static feature order
hmm_static_order { 25 7 1 -1 } ;

# number of dynamic windows
hmm_dynamic_windows { 3 3 3 -1 } ;

# variance floor, one value per HMM stream index 1-5 (vocal tract, me, lf0 x 3)
hmm_variance_floor { 0.01 0.01 0.01 0.01 0.01 } ;


## -----------------------------------------------------------------------------
## Linguistic (Label Generation Specific Parameters):
## -----------------------------------------------------------------------------

###### Definition of possible feature values (used for fast unit selection features and HTS) ################
features.phrasing               { "mid" "p-final" "s-final" "s-final-q" };
features.phon_pos               { "i11" "i12" "i13" "i22" "i23" "i33" "v" "f11" "f12" "f13" "f22" "f23" "f33" "#i11" "#i12" "#i13" "#v" "v#" "#v#" "f11#" "f22#" "f33#" "sil" "#sil#" };
features.description            { "vowel" "consonant" };
features.manner_group           { "obstruent" "sonorant" };
features.height                 { "close" "nearclose" "closemid" "mid" "openmid" "nearopen" "open" };
features.vpos                   { "front" "nearfront" "central" "nearback" "back" };
features.rounding               { "rounded" "unrounded" };
features.voicing                { "voiced" "unvoiced" };
features.poa                    { "labial" "dental" "alveolar" "postalveolar" "palatal" "velar" "glottal" };
features.manner                 { "approximant" "lateral" "nasal" "affricate" "stop" "fricative" };
features.tar_height             { "close" "nearclose" "closemid" "mid" "openmid" "nearopen" "open" };
features.tar_vpos               { "front" "nearfront" "central" "nearback" "back" };
features.tar_rounding           { "rounded" "unrounded" };
features.diphthong              { "diphthong" };
features.diphthong_tar_height   { "target_closemid" };
features.diphthong_tar_vpos     { "target_nearfront" "target_nearback" };
features.diphthong_tar_rounding { "target_rounded" "target_unrounded" };
features.quantity               { "short" "long" };
features.onset_consonant        { "onset_consonant" };
features.coda_consonant         { "coda_consonant" };
features.schwa_vowels           { "schwa_vowels" };
features.full_vowels            { "full_vowels" };
features.open_syllable_vowel    { "open_syllable_vowel" };
features.closed_syllable_vowel  { "closed_syllable_vowel" };
features.one_initial            { "one_initial" };
features.three_initial          { "three_initial" };
features.one_final              { "one_final" };
features.continuant             { "continuant" };
features.retroflex              { "retroflex" };
features.strident               { "strident" };
features.high_vowel             { "high_vowel" };
features.low_vowel              { "low_vowel" };
features.coronal                { "coronal" };
features.semi_vowel             { "semi_vowel" };
features.lax                    { "lax" };
features.liquid                 { "liquid" };
features.name                   { "*" };

## repeat of phone_defs.word_type - make sure these are the same.
features.word_type    $phone_defs.word_type ;

## mapping of feature names to hts parameters
hts_features.name                "A";
hts_features.phon_pos            "B";
hts_features.sonorant_pollution  "C";
hts_features.stress              "D";
hts_features.phrasing            "E";
hts_features.word_type           "F";
hts_features.description         "phone_defs";
hts_features.manner_group        "phone_defs";
hts_features.height              "phone_defs";
hts_features.tar_height          "phone_defs";
hts_features.vpos                "phone_defs";
hts_features.tar_vpos            "phone_defs";
hts_features.rounding            "phone_defs";
hts_features.tar_rounding        "phone_defs";
hts_features.voicing             "phone_defs";
hts_features.poa                 "phone_defs";
hts_features.manner              "phone_defs";
hts_features.diphthong              "phone_defs";
hts_features.diphthong_tar_height   "phone_defs";
hts_features.diphthong_tar_vpos     "phone_defs";
hts_features.diphthong_tar_rounding "phone_defs";
hts_features.quantity               "phone_defs";
hts_features.onset_consonant        "phone_defs";
hts_features.coda_consonant         "phone_defs";
hts_features.schwa_vowels           "phone_defs";
hts_features.full_vowels            "phone_defs";
hts_features.open_syllable_vowel    "phone_defs";
hts_features.closed_syllable_vowel  "phone_defs";
hts_features.one_initial            "phone_defs";
hts_features.three_initial          "phone_defs";
hts_features.one_final              "phone_defs";
hts_features.continuant             "phone_defs";
hts_features.retroflex              "phone_defs";
hts_features.strident               "phone_defs";
hts_features.high_vowel             "phone_defs";
hts_features.low_vowel              "phone_defs";
hts_features.coronal                "phone_defs";
hts_features.semi_vowel             "phone_defs";
hts_features.lax                    "phone_defs";
hts_features.liquid                 "phone_defs";

###### Definition of specification ##################
## These are the features used in the model target
## cost calculation and for HTS.
specification.segment.this { "name" "description" "manner_group" "height" "tar_height" "vpos" "tar_vpos" "rounding" "tar_rounding" "voicing" "poa" "manner" "diphthong" "diphthong_tar_height" "diphthong_tar_vpos" "diphthong_tar_rounding" "quantity" "onset_consonant" "coda_consonant" "schwa_vowels" "full_vowels" "open_syllable_vowel" "closed_syllable_vowel" "one_initial" "three_initial" "one_final" "continuant" "retroflex" "strident" "high_vowel" "low_vowel" "coronal" "semi_vowel" "lax" "liquid" "stress" "word_type" "phrasing" "sonorant_pollution" "phon_pos" };
specification.segment.prev { "name" "description" "manner_group" "height" "tar_height" "vpos" "tar_vpos" "rounding" "tar_rounding" "voicing" "poa" "manner" "diphthong" "diphthong_tar_height" "diphthong_tar_vpos" "diphthong_tar_rounding" "quantity" "onset_consonant" "coda_consonant" "schwa_vowels" "full_vowels" "open_syllable_vowel" "closed_syllable_vowel" "one_initial" "three_initial" "one_final" "continuant" "retroflex" "strident" "high_vowel" "low_vowel" "coronal" "semi_vowel" "lax" "liquid" "stress" "word_type" "phrasing" };
specification.segment.next { "name" "description" "manner_group" "height" "tar_height" "vpos" "tar_vpos" "rounding" "tar_rounding" "voicing" "poa" "manner" "diphthong" "diphthong_tar_height" "diphthong_tar_vpos" "diphthong_tar_rounding" "quantity" "onset_consonant" "coda_consonant" "schwa_vowels" "full_vowels" "open_syllable_vowel" "closed_syllable_vowel" "one_initial" "three_initial" "one_final" "continuant" "retroflex" "strident" "high_vowel" "low_vowel" "coronal" "semi_vowel" "lax" "liquid" "stress" "word_type" "phrasing" "sonorant_pollution" };
specification.syllable.prev  { "stress" };
specification.syllable.next  { "stress" };