Commit 8382fb46 by ntut

Initial commit

File added
../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from the default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and in the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
# Note that the index must start from a positive number, so you can't use "JOB=0:10" for example.
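# The log name is expanded the same way, e.g. "echo.JOB.log" becomes "echo.1.log" ... "echo.10.log".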
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured by
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might not match your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend='sge'
# Local machine, without any job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="run.pl"
    # Used for "*_recog.py"
    export decode_cmd="run.pl"

# Local machine logging to stdout and a log file, without any job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="stdout.pl"

# "qsub" (Sun Grid Engine, or a derivative of it)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" to a "queue" that exists in your environment.
    # To list the "queue" names, type "qhost -q".
    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

    export train_cmd="queue.pl --mem 20G --config conf/queue.conf"
    export cuda_cmd="queue.pl --mem 20G --config conf/queue.conf"
    export decode_cmd="queue.pl"

# "qsub" (Torque/PBS)
elif [ "${cmd_backend}" = pbs ]; then
    # The default setting is written in conf/pbs.conf.

    export train_cmd="pbs.pl"
    export cuda_cmd="pbs.pl"
    export decode_cmd="pbs.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" to "partition"s that exist in your environment.
    # To list the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for Slurm; it is interpreted as "--gres gpu:*".
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="slurm.pl"
    export cuda_cmd="slurm.pl"
    export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the hosts on which to execute jobs.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # This assumes you can log in to them without a password, i.e. you have to set up SSH keys.

    export train_cmd="ssh.pl"
    export cuda_cmd="ssh.pl"
    export decode_cmd="ssh.pl"

# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then

    export train_cmd="queue.pl --mem 2G"
    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
    export decode_cmd="queue.pl --mem 4G"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
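# Example usage of the exported commands (a sketch; actual log paths and
# commands vary by recipe):
#   ${train_cmd} JOB=1:"${nj}" exp/log/train.JOB.log <training command>
# Changing cmd_backend above switches schedulers without touching the recipes.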
File added
lm_weight: 0.6
ctc_weight: 0.4
beam_size: 3
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
streaming: True
tuning/decode_asr_streaming.yaml
\ No newline at end of file
--sample-frequency=16000
--num-mel-bins=80
# Default configuration
command qsub -V -v PATH -S /bin/bash
option name=* -N $0
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
option num_nodes=* -l nodes=$0:ppn=1
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
--sample-frequency=16000
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option name=* -N $0
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q Default
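# Example (illustrative): with the mappings above, "queue.pl --mem 4G --gpu 1 ..." submits via
# "qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l mem_free=4G,ram_free=4G -l gpu=1 -q Default".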
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0 # Recommended: allocate at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
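# Example (illustrative): with the mappings above, "slurm.pl --mem 8G --gpu 2 ..."
# submits via "sbatch --export=PATH --mem-per-cpu 8G -p gpu --gres=gpu:2 -c 2".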
tuning/train_asr_streaming_transformer.yaml
\ No newline at end of file
tuning/train_asr_transformer.yaml
\ No newline at end of file
tuning/train_lm_adam.yaml
\ No newline at end of file
tuning/train_lm_transformer.yaml
\ No newline at end of file
# lm_weight: 0.3
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.3
sim_chunk_length: 512
disable_repetition_detection: true
decoder_text_length_limit: 0
encoded_feat_length_limit: 0
batch_type: numel
batch_bins: 140000000
accum_grad: 1
max_epoch: 60
patience: none
init: xavier_uniform
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.0015
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
frontend_conf:
    n_fft: 512
    hop_length: 256
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
\ No newline at end of file
# network architecture
# encoder related
encoder: contextual_block_conformer # contextual_block_conformer is the core of the streaming conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    activation_type: swish
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 15
    block_size: 40      # streaming configuration
    hop_size: 16        # streaming configuration
    look_ahead: 16      # streaming configuration
    init_average: true  # streaming configuration
    ctx_pos_enc: true   # streaming configuration
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# minibatch related
# batch_type: folded
# batch_size: 128
batch_type: numel
batch_bins: 50000000
# valid_batch_size: 1
# optimization related
accum_grad: 1
grad_clip: 5
patience: 3
max_epoch: 50
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
- - valid
  - acc
  - max
early_stopping_criterion:
- valid
- cer_ctc
- min
keep_nbest_models: 10
# NoamLR is deprecated. Use WarmupLR.
# The following is the equivalent setting for NoamLR:
#
# optim: adam
# optim_conf:
#     lr: 10.
# scheduler: noamlr
# scheduler_conf:
#     model_size: 256
#     warmup_steps: 25000
#
optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 30000
num_att_plot: 0
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
\ No newline at end of file
# This configuration requires 4 GPUs with 32GB memory
batch_type: numel
batch_bins: 140000000
accum_grad: 6
max_epoch: 33
patience: none
# The initialization method for model parameters
init: xavier_uniform
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
encoder: contextual_block_transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 18
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6
    normalize_before: true
    block_size: 40
    hop_size: 16
    look_ahead: 16
    init_average: true
    ctx_pos_enc: true
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
num_att_plot: 0
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
# encoder related
encoder: transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# minibatch related
# batch_type: folded
# batch_size: 32
# optimization related
batch_type: numel
batch_bins: 16000000
accum_grad: 4
# grad_clip: 5
max_epoch: 200
# patience: 3
patience: none
init: xavier_uniform
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
# NoamLR is deprecated. Use WarmupLR.
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
batch_type: numel
batch_bins: 15000000
accum_grad: 6
max_epoch: 100
patience: none
# The initialization method for model parameters
init: xavier_uniform
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
encoder: transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 18
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6
    normalize_before: true
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
lm_conf:
    nlayers: 4
    unit: 2048
optim: adam
optim_conf:
    lr: 0.001
batch_type: folded
batch_size: 400 # batch size in LM training
max_epoch: 20   # if the data size is large, we can reduce this
patience: 3
best_model_criterion:
- - valid
  - loss
  - min
keep_nbest_models: 1
# Trained with NVIDIA Tesla V100 (16GB RAM) x4
lm: transformer
lm_conf:
    pos_enc: null
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    layer: 16
    dropout_rate: 0.1
# optimization related
grad_clip: 5.0
batch_type: numel
batch_bins: 2000000
accum_grad: 1
max_epoch: 15 # 15 epochs is enough
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
best_model_criterion:
- - valid
  - loss
  - min
keep_nbest_models: 10 # 10 is good.
\ No newline at end of file
../../TEMPLATE/asr1/db.sh
\ No newline at end of file
../asr1/decode.sh
\ No newline at end of file
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
set -e
set -u
set -o pipefail

log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
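# Example (illustrative): calling log "done" from this script prints a line like
#   2023-01-01T12:00:00 (data.sh:601:main) done
# where the file name, line number, and function depend on the call site.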
SECONDS=0

stage=0
nj=32
stop_stage=100000
train_set=train_all
valid_set=dev_all
test_sets="librispeech-test_clean librispeech-test_other \
    NER-Trs-Vol1-test NER-Trs-Vol2-test NER-Trs-Vol3-test NER-Trs-Vol4-test \
    OC16-CE80 MATBN-test thchs30-test \
    tat-vol1-test tat-vol2-test tat-edu-test"
lid=false # whether to use language id as an additional label
use_noise=false
global_path=
nlsyms_txt=data/local/nlsyms.txt

log "$0 $*"

. ./path.sh || exit 1
. ./cmd.sh || exit 1
. utils/parse_options.sh || exit 1

# if [ $# -ne 0 ]; then
#     log "Error: No positional arguments are required."
#     exit 2
# fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # combine the subsets into the train_set
    echo "Stage 1: Combine Multiple Train Data Sources"
    utils/combine_data.sh --extra-files utt2num_frames data/${train_set} \
        data/train-data/{OC16-CE80,TCC300,aishell-train} \
        data/train-data/NER-Trs-Vol{1,2,3,4}-train \
        data/train-data/{librispeech-train_100,librispeech-train_360} \
        data/train-data/{tat-vol1-train,tat-vol2-train,tat-edu}

    if $lid; then
        mv data/${train_set}/text data/${train_set}/text.bak
        python3 tools/add_lid_tag.py \
            -utt data/train-data/tat-vol1-train/utt2spk \
            -utt data/train-data/tat-vol2-train/utt2spk \
            -utt data/train-data/tat-tmp/utt2spk \
            -utt data/train-data/G2019429-fix/utt2spk \
            -utt data/train-data/G2019432-fix/utt2spk \
            -utt data/train-data/G2019459-fix/utt2spk \
            -utt data/train-data/G2019463-fix/utt2spk \
            -utt data/train-data/G2019479-fix/utt2spk \
            data/${train_set}/text.bak tools/taiwanese.v2.csv data/${train_set}/text
        rm data/${train_set}/text.bak
        utils/fix_data_dir.sh data/${train_set}
    fi

    # combine the subsets into the dev_set
    echo "Stage 1: Combine Multiple Dev Data Sources"
    utils/combine_data.sh --extra-files utt2num_frames data/${valid_set} \
        data/train-data/NER-Trs-Vol{1,2,3,4}-eval \
        data/train-data/{thchs30-dev,aishell-dev} \
        data/train-data/{librispeech-dev_clean,librispeech-dev_other} \
        data/train-data/{tat-vol1-dev,tat-vol2-dev}
        # data/train-data/{NER-Trs-Vol1-eval,librispeech-dev_clean,tat-vol1-dev}

    if $lid; then
        mv data/${valid_set}/text data/${valid_set}/text.bak
        python3 tools/add_lid_tag.py \
            -utt data/train-data/tat-vol1-dev/utt2spk \
            -utt data/train-data/tat-vol2-dev/utt2spk \
            data/${valid_set}/text.bak tools/taiwanese.v2.csv data/${valid_set}/text
        rm data/${valid_set}/text.bak
        utils/fix_data_dir.sh data/${valid_set}
    fi

    if $use_noise; then
        echo "Use FaNT to add noise"
        train_set=${train_set}_noise
        rirsdir=
        noise_opt=()
        dir_opt=()
        dest_opt=()
        # use FaNT to increase data diversity
        noise_opt+=("/nfs/TS-1635AX/Corpora/musan")
        noise_opt+=("/nfs/TS-1635AX/Corpora/NOISE_DATASETs/TRAIN")
        for n in ${noise_opt[@]}; do
            srcdir=data/train_all
            _n=$(echo $n | awk -F'/' '{print $NF}')
            local/multi_condition/perturb_data_dir_fant_convert.sh --nj $nj \
                --noisedir $n \
                $srcdir
            dest_opt+=(${srcdir}_fant_${_n})
        done
        utils/data/combine_data.sh data/train_all_fant ${dest_opt[@]}
        rm -r ${dest_opt[@]}

        dir_opt+=(data/train_all_fant)
        dir_opt+=(data/train_all)
        # add RIRs, simulated RIRs, isotropic noises, and point-source noises
        if [[ -n "$rirsdir" ]]; then
            srcdir=data/train_all_fant
            samplerate=16000
            # Make a version with reverberated speech
            rvb_opts=()
            rvb_opts+=(--rir-set-parameters "0.5, ${rirsdir}/simulated_rirs/smallroom/rir_list")
            rvb_opts+=(--rir-set-parameters "0.5, ${rirsdir}/simulated_rirs/mediumroom/rir_list")

            # Make a reverberated version of the training list. Note that we don't
            # add any additive noise here.
            python3 steps/data/reverberate_data_dir.py "${rvb_opts[@]}" \
                --prefix "reverb" \
                --speech-rvb-probability 1 \
                --pointsource-noise-addition-probability 0 \
                --isotropic-noise-addition-probability 0 \
                --num-replications 1 \
                --source-sampling-rate $samplerate \
                ${srcdir} ${srcdir}_reverb
            dir_opt+=(${srcdir}_reverb)
        fi
        utils/data/combine_data.sh data/${train_set} ${dir_opt[@]}
        utils/fix_data_dir.sh data/${train_set}
    fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ] && $lid; then
    log "stage 2: Create Non-linguistic Symbols for Language ID"
    cut -f 2- data/${train_set}/text | grep -o -P '\[.*?\]|\<.*?\>' | sort | uniq > ${nlsyms_txt}
    log "save non-linguistic symbols in ${nlsyms_txt}"
fi
#if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
#    # use external data
#    echo "$0: preparing extra corpus for subword LM training..."
#    mkdir -p data/local/other_text
#    local/lm/prepare_extra_text.sh --normjobs $nj \
#        --global-path $global_path \
#        data/local/lm data/local/lm/corpus || exit 1
#    if [ ! -e data/local/other_text/text ]; then
#        # provide an utterance id for each text line
#        find data/local/lm/norm -mindepth 1 -maxdepth 3 -type f | xargs cat > data/local/other_text/text
#    fi
#fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    for test in ${test_sets}; do
        cp -r data/test-data/${test} data
        utils/fix_data_dir.sh data/${test}
        if $lid; then
            echo "Stage 4: add language ID to Test Data Sources"
            mv data/${test}/text data/${test}/text.bak
            python3 tools/add_lid_tag.py \
                -utt data/test-data/tat-vol1-test/utt2spk \
                -utt data/test-data/tat-vol2-test/utt2spk \
                data/${test}/text.bak tools/taiwanese.csv data/${test}/text
            rm data/${test}/text.bak
            utils/fix_data_dir.sh data/${test}
        fi
    done
fi
log "Successfully finished. [elapsed=${SECONDS}s]"
# We only need the NCTU parser below for the optional text normalization (for LM training) step
PARSER_ROOT=tools/new_parser_UNICODE
export PATH=$PATH:$PARSER_ROOT

# FaNT is needed for the noise/phone-channel augmentation
FANT_ROOT=tools/fant
export PATH=$PATH:$FANT_ROOT

# g729a/b is needed for the noise/phone-channel augmentation
G729_ROOT=tools/g729a
export PATH=$PATH:$G729_ROOT
../../TEMPLATE/asr1/path.sh
\ No newline at end of file
../../TEMPLATE/asr1/pyscripts
\ No newline at end of file
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
set -e
set -u
set -o pipefail

# E2E model related
train_set=train_all
valid_set=dev_all
test_sets="librispeech-test_clean librispeech-test_other \
    tat-vol1-test tat-vol2-test tat-tmp-edu \
    NER-Trs-Vol1-test NER-Trs-Vol2-test NER-Trs-Vol3-test NER-Trs-Vol4-test"
use_noise=false
global_path=$(pwd)
lid=true # whether to use language id as an additional label
nj=150
stage=0
stop_stage=10000

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

asr_config=conf/tuning/train_asr_conformer.yaml
lm_config=conf/tuning/train_lm_transformer.yaml
inference_config=conf/decode_asr.yaml
nlsyms_txt=data/local/nlsyms.txt

if "${use_noise}"; then train_set=${train_set}_noise; fi

#--use_streaming true \
./asr.sh \
    --stage $stage \
    --stop_stage $stop_stage \
    --use_lm false \
    --nj $nj \
    --lang cht_eng_tw \
    --ngpu 10 \
    --num_nodes 1 \
    --nbpe 20000 \
    --token_type bpe \
    --feats_type raw \
    --audio_format wav \
    --max_wav_duration 30 \
    --speed_perturb_factors "0.9 1.0 1.1" \
    --asr_config "${asr_config}" \
    --lm_config "${lm_config}" \
    --inference_config "${inference_config}" \
    --local_data_opts "--global-path $global_path --nj $nj --nlsyms_txt ${nlsyms_txt} --lid ${lid}" \
    --train_set "${train_set}" \
    --valid_set "${valid_set}" \
    --test_sets "${test_sets}" \
    --bpe_nlsyms "[CHT],[EN],[TW]" \
    --lm_train_text "data/${train_set}/text" \
    --bpe_train_text "data/${train_set}/text" \
    --local_score_opts "--score_lang_id ${lid}" "$@"
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
set -e
set -u
set -o pipefail

# E2E model related
train_set=train_all
valid_set=dev_all
test_sets="librispeech-test_clean librispeech-test_other \
    NER-Trs-Vol1-test NER-Trs-Vol2-test NER-Trs-Vol3-test \
    NER-Trs-Vol4-test OC16-CE80 MATBN-test thchs30-test"
use_noise=false
global_path=$(pwd)
lid=true # whether to use language id as an additional label
nj=100

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

asr_config=conf/tuning/train_asr_streaming_conformer.yaml
lm_config=conf/tuning/train_lm_transformer.yaml
inference_config=conf/decode_asr_streaming.yaml
nlsyms_txt=data/local/nlsyms.txt

if "${use_noise}"; then train_set=${train_set}_noise; fi

./decode.sh \
    --stage 2 \
    --stop_stage 10000 \
    --use_streaming true \
    --gpu_inference false \
    --inference_nj $nj \
    --use_lm false \
    --nj $nj \
    --lang cht_eng_tw.v2 \
    --ngpu 10 \
    --num_nodes 1 \
    --nbpe 30000 \
    --token_type bpe \
    --feats_type raw \
    --audio_format wav \
    --speed_perturb_factors "0.9 1.0 1.1" \
    --asr_config "${asr_config}" \
    --lm_config "${lm_config}" \
    --inference_config "${inference_config}" \
    --local_data_opts "--global-path $global_path --stage 1" \
    --train_set "${train_set}" \
    --valid_set "${valid_set}" \
    --test_sets "${test_sets}" \
    --asr_speech_fold_length 512 \
    --asr_text_fold_length 150 \
    --lm_fold_length 150 \
    --bpe_nlsyms "[CHT],[EN],[TW]" \
    --lm_train_text "data/${train_set}/text" \
    --bpe_train_text "data/${train_set}/text" \
    --local_score_opts "--score_lang_id ${lid}" "$@"
../../TEMPLATE/asr1/scripts
\ No newline at end of file
../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
File added
import argparse
import csv
import re


def parse_opts():
    parser = argparse.ArgumentParser(
        description='Add language-ID tags ([CHT]/[EN]/[TW]) to each utterance in a Kaldi text file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-utt', '--taiwanese-utt-file', action='append',
                        help='utt2spk file whose utterance ids are treated as Taiwanese; may be given multiple times')
    parser.add_argument('in_text', type=str, help='Input text file')
    parser.add_argument('taiwanese_tab', type=str, help='Input Taiwanese syllable table (CSV)')
    parser.add_argument('out_text', type=str, help='Tagged output text file')
    opts = parser.parse_args()
    return opts
def check_english(check_str):
    # True if every character is an ASCII letter or an apostrophe.
    check = False
    for ch in check_str:
        if (ch >= u'\u0041' and ch <= u'\u005A') or \
                (ch >= u'\u0061' and ch <= u'\u007A') or ch == "'":
            check = True
        else:
            return False
    return check


def check_full_english(check_str):
    # True if every whitespace-separated word passes check_english.
    check = False
    for w in check_str.split():
        if check_english(w):
            check = True
        else:
            return False
    return check
def check_contain_chinese(check_str):
    # True if every character is a Bopomofo or Han character.
    check = False
    RE_HANS = re.compile(
        r'^(?:['
        r'\u3100-\u312f'            # Bopomofo
        r'\u3400-\u4dbf'            # CJK Ext. A: [3400-4DBF]
        r'\u4e00-\u9fff'            # CJK basic: [4E00-9FFF]
        r'\uf900-\ufaff'            # CJK Compat.: [F900-FAFF]
        r'\U00020000-\U0002A6DF'    # CJK Ext. B: [20000-2A6DF]
        r'\U0002A700-\U0002B73F'    # CJK Ext. C: [2A700-2B73F]
        r'\U0002B740-\U0002B81D'    # CJK Ext. D: [2B740-2B81D]
        r'\U0002F800-\U0002FA1F'    # CJK Compat. Supp.: [2F800-2FA1F]
        r'])+$'
    )
    for ch in check_str:
        if RE_HANS.match(ch):
            check = True
        else:
            return False
    return check
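# e.g. check_contain_chinese("你好") -> True, but check_contain_chinese("hi你") -> False,
# since a single non-Han character rejects the whole string.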
def read_csv(path):
    # Build the Taiwanese syllable inventory: the finals from the header row
    # plus every initial+final spelling from the remaining rows.
    table = []
    with open(path, newline='') as csvfile:
        rows = list(csv.reader(csvfile))
        con = rows[0][2:]
        table.extend(con)
        for row in rows[2:]:
            table.extend([row[0] + _n for _n in con])
    return table
def check_taiwanese(check_str, table):
    # True if the string looks like Taiwanese romanization: it carries a tone
    # mark or appears in the syllable table; for two-word inputs, each half
    # may also be Han text.
    tone_list = [u"ā", u"á", u"ǎ", u"à", u"â", u"a̍",
                 u"ē", u"é", u"ě", u"è", u"ê", u"e̍",
                 u"ō", u"ó", u"ǒ", u"ò", u"ô", u"o̍", u"ő",
                 u"ī", u"í", u"ǐ", u"ì", u"î", u"i̍",
                 u"ū", u"ú", u"ǔ", u"ù", u"û", u"u̍",
                 u"ń", u"ň", u"ǹ", u"n̍",
                 u"m̄", u"ḿ", u"m̀", u"m̍"]
    # tone_list = [u"ā", u"á", u"ǎ", u"à", u"â",
    #              u"ē", u"é", u"ě", u"è",
    #              u"ō", u"ó", u"ǒ", u"ò", u"ô", u"ő",
    #              u"ī", u"í", u"ǐ", u"ì", u"i", u"î",
    #              u"ū", u"ú", u"ǔ", u"ù", u"ü", u"ǖ", u"ǘ", u"ǚ", u"ǜ", u"û",
    #              u"ń", u"ň", u"ǹ",
    #              u"m̄", u"ḿ", u"m̀",
    #              u"ê", u"ê̄", u"ế", u"ê̌", u"ề"]
    if len(check_str.split()) == 2:
        for _str in check_str.split():
            if not (any([_ in tone_list for _ in _str]) or _str in table or check_contain_chinese(_str)):
                return False
        return True
    else:
        return any([_ in tone_list for _ in check_str]) or check_str in table
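# e.g. check_taiwanese("tshiu", table) is True because "tshiu" (= "tsh" + "iu") is in the
# syllable table; a two-word input such as "guá khì" needs both halves to pass
# (a tone mark, a table hit, or Han characters).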
if __name__ == '__main__':
    opts = parse_opts()

    # Collect the utterance ids that should be handled in Taiwanese mode.
    tai_id = []
    if opts.taiwanese_utt_file:
        for f in opts.taiwanese_utt_file:
            with open(f, "r") as rf:
                tai_id.extend([l.split()[0] for l in rf.readlines()])

    data = []
    table = read_csv(opts.taiwanese_tab)
    with open(opts.in_text, "r", encoding="utf-8") as rf:
        for l in rf.readlines():
            if len(l.split()) > 1:
                u = l.split(maxsplit=1)[0]
                # taiwanese mode
                taiwanese_mode = False
                if opts.taiwanese_utt_file:
                    if u in tai_id:
                        taiwanese_mode = True
                t = ""
                switchcode = ""
                b_w = ""
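                # Scan the words left to right, tracking the current language
                # segment in "switchcode" and the previous word in "b_w"; a
                # [TW]/[EN]/[CHT] tag is emitted at every language switch, and
                # Taiwanese syllables are additionally prefixed with "@".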
                try:
                    for w in l.split(maxsplit=1)[1].split():
                        if len(t) == 0:
                            if taiwanese_mode:
                                if check_taiwanese(w, table) or check_contain_chinese(w):
                                    t += "[TW] @{} ".format(w)
                                    switchcode = "tw"
                                else:
                                    t += "[EN] {} ".format(w)
                                    switchcode = "en"
                            else:
                                if check_contain_chinese(w):
                                    t += "[CHT] {} ".format(w)
                                    switchcode = "cht"
                                else:
                                    t += "[EN] {} ".format(w)
                                    switchcode = "en"
                        else:
                            if taiwanese_mode:
                                if switchcode == "tw":
                                    if check_taiwanese("{} {}".format(b_w, w), table):
                                        t += " @{} ".format(w)
                                    else:
                                        t += "[EN] {} ".format(w)
                                        switchcode = "en"
                                else:
                                    if check_full_english(b_w + w) and not check_taiwanese("{} {}".format(b_w, w), table):
                                        t += " {} ".format(w)
                                    else:
                                        t += "[TW] @{} ".format(w)
                                        switchcode = "tw"
                            else:
                                if switchcode == "cht":
                                    if check_contain_chinese(b_w + w):
                                        t += " {} ".format(w)
                                    else:
                                        t += "[EN] {} ".format(w)
                                        switchcode = "en"
                                else:
                                    if check_full_english(b_w + w):
                                        t += " {} ".format(w)
                                    else:
                                        t += "[CHT] {} ".format(w)
                                        switchcode = "cht"
                        # print(b_w + w, switchcode)
                        b_w = w
                    data.append("{} {}\n".format(u, " ".join(_t for _t in t.split()).strip()))
                except Exception as e:
                    print(u, e)

    with open(opts.out_text, "w", encoding="utf-8") as wf:
        wf.writelines(data)
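# Example invocation (mirroring local/data.sh):
#   python3 tools/add_lid_tag.py \
#       -utt data/train-data/tat-vol1-train/utt2spk \
#       data/train_all/text.bak tools/taiwanese.v2.csv data/train_all/text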
INDEX,NONE,a,ah,ai,aih,ainn,ak,am,an,ang,ann,annh,ap,at,au,auh,aunnh,e,eh,enn,ennh,i,ia,iah,iak,iam,ian,iang,iann,iannh,iap,iat,iau,iauh,ih,ik,im,in,ing,inn,io,ioh,iok,iong,ip,it,iu,iuh,iunn,m,mh,ng,ngh,o,oh,ok,om,ong,onn,onnh,oo,ooh,op,u,ua,uah,uai,uainn,uan,uang,uann,uat,ue,ueh,uh,ui,uinn,un,ut
NONE,,a,ah,ai,aih,ainn,ak,am,an,ang,ann,annh,ap,at,au,auh,aunnh,E,eh,enn,ennh,i,ia,iah,iak,iam,ian,iang,iann,iannh,iap,iat,iau,iauh,ih,ik,im,ien,ieng,inn,io,ioh,iok,iong,ip,it,iu,iuh,iunn,nm,mh,eng,ngh,e,oh,ok,om,ong,onn,onnh,o,ooh,op,u,ua,uah,uai,uainn,uan,uang,uann,uat,oe,ueh,uh,ui,uinn,uen,ut
p,b,b a,b ah,b ai,,,b ak,,b an,b ang,,,,b at,b au,,,b E,b eh,b enn,,b i,,b iah,b iak,,b ian,b iang,b iann,,,b iat,b iau,,b ih,b ik,,b ien,b ieng,b inn,b io,,,,,b it,b iu,,,,,b eng,,b e,b oh,b ok,,b ong,,,b o,,,b u,b ua,b uah,,,b uan,,b uann,b uat,b oe,b ush,b uh,b ui,,b uen,b ut
ph,p,p a,p ah,p ai,,p ainn,p ak,,p an,p ang,p ann,,,,p au,p auh,,p E,,p enn,,p i,,p iah,p iak,,p ian,p iang,p iann,,,p iat,p iau,,p ih,p ik,,p ien,p ieng,p inn,p io,,,,,p it,b iu,,,,,,p ngh,p e,p oh,p ok,,p ong,,,p o,,,p u,p ua,p uah,,,p uan,,p uann,p uat,p oe,p ueh,p uh,p ui,,p uen,p ut
m,m,m a,m ah,m ai,,,,,,,,,,,m au,m auh,,m E,m eh,,,m i,m ia,,,,,,,,,,m iau,,m ih,,,,,,,,,,,,,,,,,m eng,,,,,,,,,m o,m ooh,,,m ua,,m ue,,,,,,m oe,,,m ui,,,
b,bh,bh a,bh ah,bh ai,,,bh ak,,bh an,bh ang,,,,bh at,bh au,,,bh E,bh eh,,,bh i,,,,,bh ian,,,,,bh iat,bh iau,,bh ih,b ik,,bh ien,bh ieng,,bh io,,,,,bh it,bh iu,,,,,,,bh e,,bh ok,,bh ong,,,bh o,,,bh u,bh ua,bh uah,,,bh uan,,,bh uat,bh oe,bh ueh,,bh ui,,bh uen,bh ut
t,d,d a,d ah,d ai,,d ainn,d ak,d am,d an,d ang,d ann,,d ap,d at,d au,d auh,,d E,d eh,d enn,,d i,d ia,d iah,d iak,d iam,d ian,,d iann,,d iap,d iat,d iau,,d ih,d ik,d im,d ien,d ieng,d inn,d io,d ioh,d iok,d iong,,d it,diu,d iuh,d iunn,,,d eng,,d e,d oh,d ok,d om,d ong,,,d o,,,d u,d ua,d uah,,,d uan,,d uann,d uat,d oe,,d uh,d ui,,d uen,d ut
th,t,t a,t ah,t ai,,,t ak,t am,t an,t ang,t ann,,t ap,t at,t au,,,t E,t eh,t enn,,t i,,t iah,,t iam,t ian,,t iann,,t iap,t iat,t iau,,t ih,t ik,t im,t ien,t ieng,t inn,t io,,t iok,t iong,,tsh it,t iu,,,,,t eng,,t e,t oh,t ok,,t ong,,,t o,,,t u,t ua,t uah,,,t uan,,t uann,t uat,th E,,t uh,t ui,,t uen,t ut
n,n,n a,n ah,n ai,n ah,,,,,,,,,,n au,n auh,,n E,n eh,,,n i,n ia,,,,,,,,,,n iau,,n ih,,,,,,,,,,,,n iu,,,,,n eng,,ne,,,,,,,n o,,,,n ua,,,,,,,,,,,n ui,,,
l,l,l a,l ah,l ai,,,l ak,l am,l an,l ang,,,l ap,l at,l au,l auh,,l E,l eh,,,l i,,l iah,l iok,l iam,l ian,l iang,,,l iap,l iat,l iau,,l ih,l ik,l im,l ien,l ieng,,l io,l ioh,l iok,l iong,l ip,l it,l iu,,,,,,,l e,l oh,l ok,,l ong,,,l o,,l op,l u,l ua,l uah,,,l uan,,,l uat,l oe,,l uh,l ui,,l uen,l ut
k,g,g a,g ah,g ai,,g ainn,g ak,k am,g an,g ang,g ann,,g ap,g at,g au,g auh,,g E,g eh,g enn,,g i,g ia,g iah,,g iam,g ian,k iong,g iann,,g iap,g iat,g iua,,g ih,g ik,g im,g ien,g ieng,g inn,g io,g ioh,g iok,g iong,g ip,,g iu,,g iunn,,,g eng,,g e,g oh,g ok,,g ong,g onn,,g o,,,g u,g ua,g uah,g uai,g uainn,g uan,,g uann,g uat,g oe,g ueh,,g ui,,g uen,g ut
kh,k,k a,k ah,k ai,,k ainn,k ak,k am,k an,k ang,k ann,,k ap,k at,k au,,,k E,k eh,k enn,k ennh,k i,k ia,k iah,k iak,k iam,k ian,k iang,,,k iap,k iat,k iau,,k ih,k ik,k im,k ien,k ieg,k inn,k io,k ioh,k iok,k iong,k ip,k it,k iu,,k iunn,,,k eng,k ngh,k e,,k ok,,k ong,,,k o,,,k u,k ua,k uah,k uai,,k uan,,k uann,k uat,k oe,k ueh,k uh,k ui,k uinn,k uen,k ut
ng,ng,ng a,,ng ai,,,,,,,,,,,ng au,,,ng E,ng eh,,,,ng ia,,,,,,,,,,ng iau,ng iauh,,,,,,,,,,,,,ng iu,,,,,,,,,,,,,,ng o,,,,,,,,,,,,,,,,,,
h,h,h a,h ah,h ai,h aih,h ainn,h ak,h am,h an,h ang,h ann,h annh,h ap,h at,h au,,h aunnh,h E,h eh,,,h i,h ia,h iah,,h iam,h ian,h iang,h iann,h iannh,h iap,h iat,h iau,h iauh,,h ik,h im,h ien,h ieg,h inn,h io,h ioh,h iok,h iong,h ip,h it,h iu,,h iunn,h nm,h mh,h eng,h ngh,h e,h oh,h ok,,h ong,h onn,h onnh,h o,h ooh,,h u,h ua,h uah,h uai,h uainn,h uan,,h uann,h uat,h oe,h ugh,h uh,h ui,,h uen,h ut
g,gh,gh a,,gh ai,,,gh ak,gh am,gh an,gh ang,,,,,gh au,,,gh E,g eh,,,gh i,gh ia,gh iah,,gh iam,gh ian,gh iang,,,gh iap,gh iat,gh iau,,,gh ik,gh im,gh ien,gh ieng,,gh io,gh ioh,gh iok,gh iong,,,gh iu,,,,,,,gh e,,gh ok,,gh ong,,,gh o,,,gh u,gh ua,,,,gh uan,,,gh uat,gh oe,gh ueh,,gh ui,,gh ien,
ts,ts,ts a,ts ah,ts ai,,ts ainn,ts ak,ts am,ts an,ts ang,ts ann,,ts ap,ts at,ts au,,,ts E,ts eh,ts enn,,ts i,ts ia,ts iah,,ts iam,ts ian,ts iang,ts iann,,ts iap,ts iat,ts iau,,ts ih,ts ik,ts im,ts ien,ts ieng,ts inn,ts io,ts ioh,ts iok,ts iong,ts ip,ts it,ts iu,ts iuh,ts iunn,,,ts eng,,ts e,ts oh,ts ok,,ts ong,,,ts o,,,ts u,ts ua,ts uah,,ts uainn,ts uan,,ts uann,ts uat,ts oe,ts ueh,ts uh,ts ui,,ts uen,ts ut
tsh,tsh,tsh a,tsh ah,tsh ai,,,tsh ak,tsh am,tsh an,tsh ang,tsh ann,,tsh ap,tsh at,tsh au,tsh auh,,tsh E,tsh eh,tsh enn,,tsh i,tsh ia,tsh iah,tsh iak,tsh iam,tsh ian,tsh iang,tsh iann,,tsh iap,tsh iat,tsh iau,,tsh ih,tsh ik,tsh im,tsh ien,tsh ieng,tsh inn,tsh io,tsh ioh,tsh iok,tsh iong,x ip,tsh it,tsh iu,,tsh iunn,,,tsh eng,,tsh e,tsh oh,tsh ok,,tsh ong,,,tsh o,,,tsh u,tsh ua,tsh uah,,,tsh uan,tsh uang,tsh uann,,tsh oe,,tsh uh,tsh ui,,tsh uen,tsh ut
s,s,s a,s ah,s ai,,,s ak,s ma,s an,s ang,s ann,s annh,s ap,s at,s au,,,s E,s eh,s enn,,s i,s ia,s iah,s iak,s iam,s ian,s iang,s iann,s iann,s iap,s iat,s iau,,s ih,s ik,s im,s ien,s ieng,s inn,s io,s ioh,s iok,s iong,s ip,s it,s iu,,s iunn,,,s eng,,s e,s oh,s ok,s om,s ong,,,s o,,,s u,s ua,s uah,,s uainn,s uan,,s uann,s uat,s oe,s ueh,s uh,s ui,,s uen,s ut
j,nj,,,,,,,,,,,,,,,,,,,,,j i,j ia,j iah, ,j iam,j ian,j dang,,,j iap,j iat,j iau,,tsh ih,,j im,j ien,,,j io,,j iok,j iong,j ip,j it,j iu,,,,,,,,,,,,,,,,,j u,,j uah,,,,,,,j oe,,,,,j uen,
INDEX,NONE,a,ah,ai,aih,ainn,ak,am,an,ang,ann,annh,ap,at,au,auh,aunnh,e,eh,enn,ennh,i,ia,iah,iak,iam,ian,iang,iann,iannh,iap,iat,iau,iauh,ih,ik,im,in,ing,inn,io,ioh,iok,iong,ionn,ip,it,iu,iuh,iunn,m,mh,ng,ngh,o,oh,ok,om,ong,onn,oo,ooh,op,u,ua,uah,uai,uainn,uan,uann,uat,ue,ueh,uh,ui,uinn,un,ut
NONE,,a,ah,ai,aih,ainn,ak,am,an,ang,ann,annh,ap,at,au,auh,aunnh,e,eh,enn,ennh,i,ia,iah,iak,iam,ian,iang,iann,iannh,iap,iat,iau,iauh,ih,ik,im,in,ing,inn,io,ioh,iok,iong,ionn,ip,it,iu,iuh,iunn,m,mh,ng,ngh,o,oh,ok,om,ong,onn,oo,ooh,op,u,ua,uah,uai,uainn,uan,uann,uat,ue,ueh,uh,ui,kh uinn,un,ut
p,p,p a,p ah,p ai,,,p ak,,p an,p ang,,,,p at,p au,,,p e,p eh,p enn,,p i,,p iah,p iak,,p ian,p iang,p iann,,,p iat,p iau,,p ih,p ik,,p in,p ing,p inn,p io,,,,,,p it,p iu,,,,,p ng,,p o,p oh,p ok,,p ong,,p oo,,,p u,p ua,p uah,,,p uan,p uann,p uat,p ue,p ueh,p uh,p ui,,p un,p ut
ph,ph,ph a,ph ah,ph ai,,ph ainn,ph ak,,ph an,ph ang,ph ann,,,,ph au,ph auh,,ph e,,ph enn,,ph i,,ph iah,ph iak,,ph ian,ph iang,ph iann,,,ph iat,ph iau,,ph ih,ph ik,,ph in,ph ing,ph inn,ph io,,,,,,ph it,,,,,,,ph ngh,ph o,ph oh,ph ok,,ph ong,,ph oo,,,ph u,ph ua,ph uah,,,ph uan,ph uann,ph uat,ph ue,ph ueh,ph uh,ph ui,,ph un,ph ut
b,b,b a,b ah,b ai,,,b ak,,b an,b ang,,,,b at,b au,,,b e,b eh,,,b i,,,,,b ian,,,,,b iat,b iau,,b ih,b ik,,b in,b ing,,b io,,,,,,b it,b iu,,,,,,,b o,,b ok,,b ong,,b oo,,,b u,b ua,b uah,,,b uan,,b uat,b ue,b ueh,,b ui,,b un,b ut
m,m,m a,m ah,m ai,,,,,,,,,,,m au,m auh,,m e,m eh,,,m i,m ia,,,,,,,,,,m iau,,m ih,,,,,,,,,,,,,,,,,,m ng,,,,,,,,m oo,m ooh,,,m ua,,,,,,,m ue,,,m ui,,,
t,t,t a,t ah,t ai,,t ainn,t ak,t am,t an,t ang,t ann,,t ap,t at,t au,t auh,,t e,t eh,t enn,,t i,t ia,t iah,t iak,t iam,t ian,,t iann,,t iap,t iat,t iau,,t ih,t ik,t im,t in,t ing,t inn,t io,t ioh,t iok,t iong,t iunn,,t it,t iu,t iuh,t iunn,,,t ng,,t o,t oh,t ok,t om,t ong,,t oo,,,t u,t ua,t uah,,,t uan,t uann,t uat,t ue,,t uh,t ui,,t un,t ut
th,th,th a,th ah,th ai,,,th ak,th am,th an,th ang,th ann,,th ap,th at,th au,,,th e,th eh,th enn,,th i,,th iah,,th iam,th ian,,th iann,,th iap,th iat,th iau,,th ih,th ik,th im,th in,th ing,th inn,th io,,th iok,th iong,,,,th iu,,,,,th ng,,th o,th oh,th ok,,th ong,,th oo,,,th u,th ua,th uah,,,th uan,th uann,th uat,,,th uh,th ui,,th un,th ut
n,n,n a,n ah,n ai,,,,,,,,,,,n au,n auh,,n e,n eh,,,n i,n ia,,,,,,,,,,n iau,,n ih,,,,,,,,,,,,,n iu,,,,,n ng,,,,,,,,n oo,,,,n ua,,,,,,,,,,,,,
l,l,l a,l ah,l ai,,,l ak,l am,l an,l ang,,,l ap,l at,l au,l auh,,l e,l eh,,,l i,,l iah,,l iam,l ian,l iang,,,l iap,l iat,l iau,,l ih,l ik,l im,l in,l ing,,l io,l ioh,l iok,l iong,,l ip,l it,l iu,,,,,,,l o,l oh,l ok,,l ong,,l oo,l ooh,l op,l u,l ua,l uah,,,l uan,,l uat,l ue,,l uh,l ui,,l un,l ut
k,k,k a,k ah,k ai,,k ainn,k ak,k am,k an,k ang,k ann,,k ap,k at,k au,k auh,,k e,k eh,k enn,,k i,k ia,k iah,,k iam,k ian,,k iann,,k iap,k iat,k iau,,k ih,k ik,k im,k in,k ing,k inn,k io,k ioh,k iok,k iong,,k ip,,k iu,,k iunn,,,k ng,,k o,k oh,k ok,,k ong,k onn,k oo,,,k u,k ua,k uah,k uai,k uainn,k uan,k uann,k uat,k ue,k ueh,,k ui,,k un,k ut
kh,kh,kh a,kh ah,kh ai,,kh ainn,kh ak,kh am,kh an,kh ang,kh ann,,kh ap,kh at,kh au,,kh aunnh,kh e,kh eh,kh enn,kh ennh,kh i,kh ia,kh iah,kh iak,kh iam,kh ian,kh iang,,,kh iap,kh iat,kh iau,,kh ih,kh ik,kh im,kh in,kh ing,kh inn,kh io,kh ioh,kh iok,kh iong,,kh ip,kh it,kh iu,,kh iunn,,,kh ng,kh ngh,kh o,,kh ok,,kh ong,,kh oo,,,kh u,kh ua,kh uah,kh uai,,kh uan,kh uann,kh uat,kh ue,kh ueh,kh uh,kh ui,kh uinn,kh un,kh ut
g,g,g a,,g ai,,,g ak,g am,g an,g ang,,,,,g au,,,g e,,,,g i,g ia,g iah,,g iam,g ian,g iang,,,g iap,g iat,g iau,,,g ik,g im,g in,g ing,,g io,g ioh,g iok,g iong,,,,g iu,,,,,,,g o,,g ok,,g ong,,g oo,,,g u,g ua,,,,g uan,,g uat,g ue,g ueh,,g ui,,g uan,
ng,ng,ng a,,ng ai,,,,,,,,,,,ng au,,,ng e,ng eh,,,,ng ia,,,,,,,,,,ng iau,ng iauh,,,,,,,,,,,,,,ng iu,,,,,,,,,,,,,ng oo,,,,,,,,,,,,,,,,,
h,h,h a,h ah,h ai,h aih,h ainn,h ak,h am,h an,h ang,h ann,h annh,h ap,h at,h au,,,h e,h eh,,,h i,h ia,h iah,,h iam,h ian,h iang,h iann,h iannh,h iap,h iat,h iau,h iauh,,h ik,h im,h in,h ing,h inn,h io,h ioh,h iok,h iong,,h ip,h it,h iu,,h iunn,h m,h mh,h ng,h ngh,h o,h oh,h ok,,h ong,h onn,h oo,h ooh,,h u,h ua,h uah,h uai,h uainn,h uan,h uann,h uat,h ue,h ueh,,h ui,,h un,h ut
ts,ts,ts a,ts ah,ts ai,,ts ainn,ts ak,ts am,ts an,ts ang,ts ann,,ts ap,ts at,ts au,,,ts e,ts eh,ts enn,,ts i,ts ia,ts iah,,ts iam,ts ian,ts iang,ts iann,,ts iap,ts iat,ts iau,,ts ih,ts ik,ts im,ts in,ts ing,ts inn,ts io,ts ioh,ts iok,ts iong,,ts ip,ts it,ts iu,ts iuh,ts iunn,,,ts ng,,ts o,ts oh,ts ok,,ts ong,,ts oo,,,ts u,ts ua,ts uah,,ts uainn,ts uan,ts uann,ts uat,ts ue,,ts uh,ts ui,,ts un,ts ut
tsh,tsh,tsh a,tsh ah,tsh ai,,,tsh ak,tsh am,tsh an,tsh ang,tsh ann,,tsh ap,tsh at,tsh au,tsh auh,,tsh e,tsh eh,tsh enn,,tsh i,tsh ia,tsh iah,tsh iak,tsh iam,tsh ian,tsh iang,tsh iann,,tsh iap,tsh iat,tsh iau,,tsh ih,tsh ik,tsh im,tsh in,tsh ing,tsh inn,tsh io,tsh ioh,tsh iok,tsh iong,,tsh ip,tsh it,tsh iu,,tsh iunn,,,tsh ng,,tsh o,tsh oh,tsh ok,,tsh ong,,tsh oo,,,tsh u,tsh ua,tsh uah,,,tsh uan,tsh uann,,tsh ue,,tsh uh,tsh ui,,tsh un,tsh ut
s,s,s a,s ah,s ai,,,s ak,s am,s an,s ang,s ann,s annh,s ap,s at,s au,,,s e,s eh,s enn,,s i,s ia,s iah,s iak,s iam,s ian,s iang,s iann,,s iap,s iat,s iau,,s ih,s ik,s im,s in,s ing,s inn,s io,s ioh,s iok,s iong,s iunn,s ip,s it,s iu,,s iunn,,,s ng,,s o,s oh,s ok,s om,s ong,,s oo,,,s u,s ua,s uah,,s uainn,s uan,s uann,s uat,s ue,s ueh,s uh,s ui,,s un,s ut
j,j,,,,,,,,,,,,,,,,,,,,,j i,j ia,j iah, ,j iam,j ian,j iang,,,j iap,j iat,j iau,,,,j im,j in,,,j io,,j iok,j iong,,j ip,j it,j iu,,,,,,,,,,,,,,,,j u,,j uah,,,,,,j ue,,,,,j un,
\ No newline at end of file
../../../tools/kaldi/egs/wsj/s5/utils
\ No newline at end of file