Commit 61f0cf5e by ntut

Initial commit

../../TEMPLATE/asr1/asr.sh
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthread>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from the default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
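#
# For example (an informal illustration; the log path is arbitrary),
#   run.pl JOB=1:4 exp/log/echo.JOB.log echo JOB
# launches 4 jobs in parallel and writes exp/log/echo.1.log ... exp/log/echo.4.log,
# each containing the output of the expanded command ("1" ... "4").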
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured
# by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might not match your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend='sge'
# Local machine, without any job scheduling system
if [ "${cmd_backend}" = local ]; then

    # Used for all other jobs
    export train_cmd="run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="run.pl"
    # Used for "*_recog.py"
    export decode_cmd="run.pl"

# Local machine logging to stdout and a log file, without any job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # Used for all other jobs
    export train_cmd="stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="stdout.pl"

# "qsub" (Sun Grid Engine, or a derivative of it)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" to a "queue" that exists in your environment.
    # To list the "queue" names, type "qhost -q".
    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

    export train_cmd="queue.pl --mem 20G --config conf/queue.conf"
    export cuda_cmd="queue.pl --mem 20G --config conf/queue.conf"
    export decode_cmd="queue.pl"

# "qsub" (Torque/PBS)
elif [ "${cmd_backend}" = pbs ]; then
    # The default setting is written in conf/pbs.conf.

    export train_cmd="pbs.pl"
    export cuda_cmd="pbs.pl"
    export decode_cmd="pbs.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" to "partition" names that exist in your environment.
    # To list the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for Slurm; it is interpreted as "--gres gpu:*".
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="slurm.pl"
    export cuda_cmd="slurm.pl"
    export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the hosts on which to execute jobs.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # This assumes you can log in to them without a password, i.e. you have to set up ssh keys.

    export train_cmd="ssh.pl"
    export cuda_cmd="ssh.pl"
    export decode_cmd="ssh.pl"

# This is an example of specifying several unique options for the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then

    export train_cmd="queue.pl --mem 2G"
    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
    export decode_cmd="queue.pl --mem 4G"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
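
# For reference (an informal sketch; the log path and tool arguments are illustrative),
# recipe scripts submit jobs through these exported variables, e.g.:
#   ${decode_cmd} JOB=1:${nj} exp/decode/log/decode.JOB.log \
#       python3 -m espnet2.bin.asr_inference --output_dir exp/decode/output.JOB ...
# so changing ${cmd_backend} switches the scheduler without touching the recipes.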
lm_weight: 0.6
ctc_weight: 0.4
beam_size: 3
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
streaming: true
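
# Note (informal): in ESPnet's joint CTC/attention decoding, hypotheses are scored
# roughly as
#   score = ctc_weight * log P_ctc + (1 - ctc_weight) * log P_att + lm_weight * log P_lm
# so this file weights CTC at 0.4, the attention decoder at 0.6, and the LM at 0.6.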
tuning/decode_asr_streaming.yaml
--sample-frequency=16000
--num-mel-bins=80
# Default configuration
command qsub -V -v PATH -S /bin/bash
option name=* -N $0
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
option num_nodes=* -l nodes=$0:ppn=1
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
--sample-frequency=16000
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option name=* -N $0
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q Default
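
# For reference (an informal illustration of how the mappings above expand), a call like
#   queue.pl --mem 10G --num-threads 4 JOB=1:8 exp/log/train.JOB.log <command>
# would be submitted roughly as
#   qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l mem_free=10G,ram_free=10G -pe smp 4 ...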
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0 # It is recommended to allocate at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
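# For reference (informal): with the mappings above, "--gpu 2" expands to
# "-p gpu --gres=gpu:2 -c 2", while a CPU-only job ("--gpu 0") is sent to "-p cpu".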
tuning/train_asr_streaming_transformer.yaml
tuning/train_asr_transformer.yaml
tuning/train_lm_adam.yaml
tuning/train_lm_transformer.yaml
# lm_weight: 0.3
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.3
sim_chunk_length: 512
disable_repetition_detection: true
decoder_text_length_limit: 0
encoded_feat_length_limit: 0
batch_type: numel
batch_bins: 140000000
accum_grad: 1
max_epoch: 60
patience: none
init: xavier_uniform
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.0015
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
frontend_conf:
    n_fft: 512
    hop_length: 256
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
# network architecture
# encoder related
encoder: contextual_block_conformer # contextual_block_conformer is the core of streaming conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    activation_type: swish
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 15
    block_size: 40      # streaming configuration
    hop_size: 16        # streaming configuration
    look_ahead: 16      # streaming configuration
    init_average: true  # streaming configuration
    ctx_pos_enc: true   # streaming configuration
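    # Note (an informal estimate, assuming a typical 10 ms frame shift and the
    # conv2d frontend's 4x subsampling): one encoder frame spans ~40 ms, so
    # block_size 40 is a ~1.6 s window, advanced every ~0.64 s (hop_size 16)
    # with ~0.64 s of look-ahead.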
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# minibatch related
# batch_type: folded
# batch_size: 128
batch_type: numel
batch_bins: 50000000
#valid_batch_size: 1
# optimization related
accum_grad: 1
grad_clip: 5
patience: 3
max_epoch: 50
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
-   - valid
    - acc
    - max
early_stopping_criterion:
- valid
- cer_ctc
- min
keep_nbest_models: 10
# NoamLR is deprecated. Use WarmupLR.
# The following is an equivalent setting for NoamLR:
#
# optim: adam
# optim_conf:
#     lr: 10.
# scheduler: noamlr
# scheduler_conf:
#     model_size: 256
#     warmup_steps: 25000
#
optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 30000
num_att_plot: 0
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
# This configuration requires 4 GPUs with 32GB memory
batch_type: numel
batch_bins: 140000000
accum_grad: 6
max_epoch: 33
patience: none
# The initialization method for model parameters
init: xavier_uniform
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
encoder: contextual_block_transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 18
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6
    normalize_before: true
    block_size: 40
    hop_size: 16
    look_ahead: 16
    init_average: true
    ctx_pos_enc: true
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
num_att_plot: 0
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
# encoder related
encoder: transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# minibatch related
# batch_type: folded
# batch_size: 32
# optimization related
batch_type: numel
batch_bins: 16000000
accum_grad: 4
# grad_clip: 5
max_epoch: 200
# patience: 3
patience: none
init: xavier_uniform
val_scheduler_criterion:
- valid
- acc
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
# NoamLR is deprecated. Use WarmupLR.
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
batch_type: numel
batch_bins: 15000000
accum_grad: 6
max_epoch: 100
patience: none
# The initialization method for model parameters
init: xavier_uniform
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
encoder: transformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 18
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6
    normalize_before: true
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
lm_conf:
    nlayers: 4
    unit: 2048
optim: adam
optim_conf:
    lr: 0.001
batch_type: folded
batch_size: 400     # batch size in LM training
max_epoch: 20       # if the data size is large, we can reduce this
patience: 3
best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 1
# Trained on 4x Nvidia Tesla V100, 16 GB RAM each
lm: transformer
lm_conf:
    pos_enc: null
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    layer: 16
    dropout_rate: 0.1
# optimization related
grad_clip: 5.0
batch_type: numel
batch_bins: 2000000
accum_grad: 1
max_epoch: 15       # 15 epochs are enough
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 10   # 10 is good.
../../TEMPLATE/asr1/db.sh
../asr1/decode.sh
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
SECONDS=0
stage=0
nj=32
stop_stage=100000
train_set=train_all
valid_set=dev_all
test_sets="tat-vol1-test tat-vol2-test tat-tmp-test"
use_noise=false
global_path=
log "$0 $*"
. ./path.sh || exit 1
. ./cmd.sh || exit 1
. utils/parse_options.sh || exit 1
if [ $# -ne 0 ]; then
    log "Error: No positional arguments are required."
    exit 2
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # combine the sub sets into the train_set
    echo "Stage 1: Combine Multiple Train Data Sources"
    utils/combine_data.sh --extra-files utt2num_frames data/${train_set} \
        data/train-data/{tat-vol1-train,tat-vol2-train,tat-tmp}
    # combine the sub sets into the dev_set
    echo "Stage 1: Combine Multiple Dev Data Sources"
    utils/combine_data.sh --extra-files utt2num_frames data/${valid_set} \
        data/train-data/{tat-vol1-dev,tat-vol2-dev}

    if $use_noise; then
        echo "Use FaNT to add noise"
        train_set=${train_set}_noise
        rirsdir=
        noise_opt=()
        dir_opt=()
        dest_opt=()
        # use FaNT to increase data diversity
        noise_opt+=("/nfs/TS-1635AX/Corpora/DNS-Challenge")
        for n in "${noise_opt[@]}"; do
            srcdir=data/train_all
            _n=$(echo $n | awk -F'/' '{print $NF}')
            local/multi_condition/perturb_data_dir_fant_convert.sh --nj $nj \
                --noisedir $n \
                $srcdir
            dest_opt+=(${srcdir}_fant_${_n})
        done
        utils/data/combine_data.sh data/train_all_fant "${dest_opt[@]}"
        rm -r "${dest_opt[@]}"

        dir_opt+=(data/train_all_fant)
        dir_opt+=(data/train_all)
        # add RIRs, simulated RIRs, isotropic noises and point-source noises
        if [[ -n "$rirsdir" ]]; then
            srcdir=data/train_all_fant
            samplerate=16000
            # Make a version with reverberated speech
            rvb_opts=()
            rvb_opts+=(--rir-set-parameters "0.5, ${rirsdir}/simulated_rirs/smallroom/rir_list")
            rvb_opts+=(--rir-set-parameters "0.5, ${rirsdir}/simulated_rirs/mediumroom/rir_list")

            # Make a reverberated version of the training list. Note that we don't
            # add any additive noise here.
            python3 steps/data/reverberate_data_dir.py "${rvb_opts[@]}" \
                --prefix "reverb" \
                --speech-rvb-probability 1 \
                --pointsource-noise-addition-probability 0 \
                --isotropic-noise-addition-probability 0 \
                --num-replications 1 \
                --source-sampling-rate $samplerate \
                ${srcdir} ${srcdir}_reverb
            dir_opt+=(${srcdir}_reverb)
        fi
        utils/data/combine_data.sh data/${train_set} "${dir_opt[@]}"
        utils/fix_data_dir.sh data/${train_set}
    fi
fi
#if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
#    # use external data
#    echo "$0: preparing extra corpus for subword LM training..."
#    mkdir -p data/local/other_text
#    local/lm/prepare_extra_text.sh --normjobs $nj \
#        --global-path $global_path \
#        data/local/lm data/local/lm/corpus || exit 1
#    if [ ! -e data/local/other_text/text ]; then
#        # provide an utterance id to each text
#        find data/local/lm/norm -mindepth 1 -maxdepth 3 -type f | xargs cat > data/local/other_text/text
#    fi
#fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    for f in ${test_sets}; do
        cp -r data/test-data/${f} data
        utils/fix_data_dir.sh data/${f}
    done
fi
log "Successfully finished. [elapsed=${SECONDS}s]"
# We only need the NCTU parser stuff below for the optional text normalization (for LM training) step
PARSER_ROOT=tools/new_parser_UNICODE
export PATH=$PATH:$PARSER_ROOT

# FaNT is needed for the noise/phone-channel simulation
FANT_ROOT=tools/fant
export PATH=$PATH:$FANT_ROOT

# g729a/b is needed for the noise/phone-channel simulation
G729_ROOT=tools/g729a
export PATH=$PATH:$G729_ROOT
../../TEMPLATE/asr1/path.sh
../../TEMPLATE/asr1/pyscripts
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

# E2E model related
train_set=train_all
valid_set=dev_all
test_sets="tat-vol1-test tat-vol2-test tat-tmp-test"
use_noise=false
global_path=$(pwd)
nj=150
stage=0
stop_stage=10000

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

asr_config=conf/tuning/train_asr_streaming_conformer.yaml
lm_config=conf/tuning/train_lm_transformer.yaml
inference_config=conf/decode_asr_streaming.yaml

if "${use_noise}"; then train_set=${train_set}_noise; fi

./asr.sh \
    --stage $stage \
    --stop_stage $stop_stage \
    --use_streaming true \
    --use_lm false \
    --nj $nj \
    --lang tw \
    --ngpu 10 \
    --num_nodes 1 \
    --nbpe 5000 \
    --token_type word \
    --feats_type raw \
    --audio_format wav \
    --max_wav_duration 30 \
    --speed_perturb_factors "0.9 1.0 1.1" \
    --asr_config "${asr_config}" \
    --lm_config "${lm_config}" \
    --inference_config "${inference_config}" \
    --local_data_opts "--global-path $global_path --nj $nj --stage 1" \
    --train_set "${train_set}" \
    --valid_set "${valid_set}" \
    --test_sets "${test_sets}" \
    --lm_train_text "data/${train_set}/text" \
    --bpe_train_text "data/${train_set}/text" "$@"
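
# For reference (informal; stage numbering follows ESPnet2's TEMPLATE/asr1/asr.sh):
#   ./run.sh --stage 11 --stop_stage 11    # rerun only the ASR training stage
#   ./run.sh --stage 12                    # decode with the trained model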
#!/bin/bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

# E2E model related
train_set=train_all
valid_set=dev_all
test_sets="tat-vol1-test tat-vol2-test tat-tmp-test"
use_noise=false
global_path=$(pwd)
nj=100

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

asr_config=conf/tuning/train_asr_streaming_conformer.yaml
lm_config=conf/tuning/train_lm_transformer.yaml
inference_config=conf/decode_asr_streaming.yaml

if "${use_noise}"; then train_set=${train_set}_noise; fi

./decode.sh \
    --stage 0 \
    --stop_stage 10000 \
    --use_streaming true \
    --gpu_inference false \
    --inference_nj $nj \
    --use_lm false \
    --nj $nj \
    --lang tw \
    --ngpu 10 \
    --num_nodes 1 \
    --nbpe 5000 \
    --token_type word \
    --feats_type raw \
    --audio_format wav \
    --speed_perturb_factors "0.9 1.0 1.1" \
    --asr_config "${asr_config}" \
    --lm_config "${lm_config}" \
    --inference_config "${inference_config}" \
    --local_data_opts "--global-path $global_path --stage 1" \
    --train_set "${train_set}" \
    --valid_set "${valid_set}" \
    --test_sets "${test_sets}" \
    --asr_speech_fold_length 512 \
    --asr_text_fold_length 150 \
    --lm_fold_length 150 \
    --lm_train_text "data/${train_set}/text"
../../TEMPLATE/asr1/scripts
../../../tools/kaldi/egs/wsj/s5/steps
../../../tools/kaldi/egs/wsj/s5/utils