From ce12eafd2d7daa229fdac13cb0be62806cb6938c Mon Sep 17 00:00:00 2001 From: mingyang Date: Fri, 5 Jan 2024 10:37:14 +0800 Subject: [PATCH 01/12] Add using of speaker embedding --- models/base/base_dataset.py | 16 ++++++++++++++++ modules/encoder/condition_encoder.py | 7 +++++++ processors/acoustic_extractor.py | 7 +++++++ 3 files changed, 30 insertions(+) diff --git a/models/base/base_dataset.py b/models/base/base_dataset.py index 8c1216a2..769df5c1 100644 --- a/models/base/base_dataset.py +++ b/models/base/base_dataset.py @@ -51,6 +51,20 @@ def __init__(self, cfg, dataset, is_valid=False): utt, spk = line.strip().split("\t") self.utt2spk[utt] = spk + if cfg.preprocess.use_spkemb: + self.utt2spk_path = {} + for utt_info in self.metadata: + dataset = utt_info["Dataset"] + uid = utt_info["Uid"] + utt = "{}_{}".format(dataset, uid) + + self.utt2spk_path[utt] = os.path.join( + cfg.preprocess.processed_dir, + dataset, + cfg.preprocess.speaker_dir, + uid + ".npy", + ) + if cfg.preprocess.use_uv: self.utt2uv_path = {} for utt_info in self.metadata: @@ -208,6 +222,8 @@ def __getitem__(self, index): single_feature["spk_id"] = np.array( [self.spk2id[self.utt2spk[utt]]], dtype=np.int32 ) + if self.cfg.preprocess.use_spkemb: + single_feature["spkemb"] = np.load(self.utt2spk_path[utt]) if self.cfg.preprocess.use_mel: mel = np.load(self.utt2mel_path[utt]) diff --git a/modules/encoder/condition_encoder.py b/modules/encoder/condition_encoder.py index 1600d078..db7d82c6 100644 --- a/modules/encoder/condition_encoder.py +++ b/modules/encoder/condition_encoder.py @@ -177,6 +177,8 @@ def __init__(self, cfg): ### Speaker Features ### if cfg.use_spkid: self.singer_encoder = SingerEncoder(self.cfg) + if cfg.use_spkemb: + self.speaker_project = nn.Linear(self.cfg.spkemb_dim, self.cfg.content_encoder_dim) def forward(self, x): outputs = [] @@ -232,6 +234,11 @@ def forward(self, x): singer_info = speaker_enc_out.expand(-1, seq_len, -1) outputs.append(singer_info) + if "spkemb" in x.keys(): + speaker_embedding = self.speaker_project(x["spkemb"].unsqueeze(1)) # [b, 1, 384] + speaker_embedding = speaker_embedding.expand(-1, seq_len, -1) + outputs.append(speaker_embedding) + encoder_output = None if self.merge_mode == "concat": encoder_output = torch.cat(outputs, dim=-1) diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 9c4d9be7..5e0d7551 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -8,6 +8,7 @@ import numpy as np import json +import resemblyzer from tqdm import tqdm from sklearn.preprocessing import StandardScaler from utils.io import save_feature, save_txt, save_torch_audio @@ -262,6 +263,12 @@ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt): wav = wav[start:end].astype(np.float32) wav_torch = torch.from_numpy(wav).to(wav_torch.device) + if cfg.preprocess.extract_speaker: + voice_encoder = resemblyzer.VoiceEncoder(verbose=False) + speaker_wav = resemblyzer.preprocess_wav(wav_path) + speaker_embedding = voice_encoder.embed_utterance(speaker_wav) + save_feature(dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding) + if cfg.preprocess.extract_linear_spec: from utils.mel import extract_linear_features From e6d41ef17527aef964408f073f9dd0216737a316 Mon Sep 17 00:00:00 2001 From: mingyang Date: Fri, 5 Jan 2024 10:42:15 +0800 Subject: [PATCH 02/12] Add TransformerVC implementation --- bins/vc/inference.py | 257 ++++++++++ bins/vc/preprocess.py | 185 ++++++++ bins/vc/train.py | 104 +++++ egs/vc/README.md | 34 ++ 
egs/vc/TransformerVC/README.md | 164 +++++++ egs/vc/TransformerVC/exp_config.json | 109 +++++ egs/vc/TransformerVC/run.sh | 1 + egs/vc/_template/run.sh | 150 ++++++ models/vc/__init__.py | 0 models/vc/base/__init__.py | 7 + models/vc/base/vc_dataset.py | 441 ++++++++++++++++++ models/vc/base/vc_inference.py | 15 + models/vc/base/vc_trainer.py | 111 +++++ models/vc/transformer/__init__.py | 0 models/vc/transformer/conformer.py | 405 ++++++++++++++++ models/vc/transformer/transformer.py | 82 ++++ .../vc/transformer/transformer_inference.py | 45 ++ models/vc/transformer/transformer_trainer.py | 52 +++ 18 files changed, 2162 insertions(+) create mode 100644 bins/vc/inference.py create mode 100644 bins/vc/preprocess.py create mode 100644 bins/vc/train.py create mode 100755 egs/vc/README.md create mode 100644 egs/vc/TransformerVC/README.md create mode 100644 egs/vc/TransformerVC/exp_config.json create mode 120000 egs/vc/TransformerVC/run.sh create mode 100644 egs/vc/_template/run.sh create mode 100644 models/vc/__init__.py create mode 100644 models/vc/base/__init__.py create mode 100644 models/vc/base/vc_dataset.py create mode 100644 models/vc/base/vc_inference.py create mode 100644 models/vc/base/vc_trainer.py create mode 100644 models/vc/transformer/__init__.py create mode 100644 models/vc/transformer/conformer.py create mode 100644 models/vc/transformer/transformer.py create mode 100644 models/vc/transformer/transformer_inference.py create mode 100644 models/vc/transformer/transformer_trainer.py diff --git a/bins/vc/inference.py b/bins/vc/inference.py new file mode 100644 index 00000000..fce97e38 --- /dev/null +++ b/bins/vc/inference.py @@ -0,0 +1,257 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
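+#
+# Entry point for voice conversion inference. A typical invocation (the paths
+# below are illustrative examples, not fixed defaults) looks like:
+#
+#   python bins/vc/inference.py \
+#       --config egs/vc/TransformerVC/exp_config.json \
+#       --acoustics_dir ckpts/vc/[YourExptName] \
+#       --vocoder_dir pretrained/bigvgan \
+#       --target [target_speaker_reference].wav \
+#       --source [source_audio_dir] \
+#       --output_dir conversion_results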
+ +import argparse +import os +import glob +from tqdm import tqdm +import json +import torch +import time + + +from models.vc.transformer.transformer_inference import TransformerInference +from utils.util import load_config +from utils.audio_slicer import split_audio, merge_segments_encodec +from processors import acoustic_extractor, content_extractor + + +def build_inference(args, cfg, infer_type="from_dataset"): + supported_inference = { + "TransformerVC": TransformerInference, + } + + inference_class = supported_inference[cfg.model_type] + return inference_class(args, cfg, infer_type) + + +def prepare_for_audio_file(args, cfg, num_workers=1): + preprocess_path = cfg.preprocess.processed_dir + audio_name = cfg.inference.source_audio_name + temp_audio_dir = os.path.join(preprocess_path, audio_name) + + ### eval file + t = time.time() + eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name) + args.source = eval_file + with open(eval_file, "r") as f: + metadata = json.load(f) + print("Prepare for meta eval data: {:.1f}s".format(time.time() - t)) + + ### acoustic features + t = time.time() + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, temp_audio_dir, cfg + ) + if cfg.preprocess.use_min_max_norm_mel == True: + acoustic_extractor.cal_mel_min_max( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + print("Prepare for acoustic features: {:.1f}s".format(time.time() - t)) + + ### content features + t = time.time() + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + print("Prepare for content features: {:.1f}s".format(time.time() - t)) + return args, cfg, temp_audio_dir + + +def merge_for_audio_segments(audio_files, args, cfg): + audio_name = cfg.inference.source_audio_name + target_singer_name = args.target_singer + + merge_segments_encodec( + wav_files=audio_files, + fs=cfg.preprocess.sample_rate, + output_path=os.path.join( + args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name) + ), + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + for tmp_file in audio_files: + os.remove(tmp_file) + + +def prepare_source_eval_file(cfg, temp_audio_dir, audio_name): + """ + Prepare the eval file (json) for an audio + """ + + audio_chunks_results = split_audio( + wav_file=cfg.inference.source_audio_path, + target_sr=cfg.preprocess.sample_rate, + output_dir=os.path.join(temp_audio_dir, "wavs"), + max_duration_of_segment=cfg.inference.segments_max_duration, + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + metadata = [] + for i, res in enumerate(audio_chunks_results): + res["index"] = i + res["Dataset"] = audio_name + res["Singer"] = audio_name + res["Uid"] = "{}_{}".format(audio_name, res["Uid"]) + metadata.append(res) + + eval_file = os.path.join(temp_audio_dir, "eval.json") + with open(eval_file, "w") as f: + json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True) + + return eval_file + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def infer(args, cfg, infer_type): + # Build inference + t = time.time() + trainer = build_inference(args, cfg, infer_type) + print("Model Init: 
{:.1f}s".format(time.time() - t)) + + # Run inference + t = time.time() + output_audio_files = trainer.inference() + print("Model inference: {:.1f}s".format(time.time() - t)) + return output_audio_files + + +def build_parser(): + r"""Build argument parser for inference.py. + Anything else should be put in an extra config YAML file. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--acoustics_dir", + type=str, + help="Acoustics model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--vocoder_dir", + type=str, + required=True, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--target", + type=str, + required=True, + help="Target audio file.", + ) + parser.add_argument( + "--trans_key", + default=0, + help="0: no pitch shift; autoshift: pitch shift; int: key shift.", + ) + parser.add_argument( + "--source", + type=str, + default="source_audio", + help="Source audio file or directory. If a JSON file is given, " + "inference from dataset is applied. If a directory is given, " + "inference from all wav/flac/mp3 audio files in the directory is applied. " + "Default: inference from all wav/flac/mp3 audio files in ./source_audio", + ) + parser.add_argument( + "--output_dir", + type=str, + default="conversion_results", + help="Output directory. Default: ./conversion_results", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. Default: warning", + ) + parser.add_argument( + "--keep_cache", + action="store_true", + default=True, + help="Keep cache files. Only applicable to inference from files.", + ) + parser.add_argument( + "--diffusion_inference_steps", + type=int, + default=50, + help="Number of inference steps. 
Only applicable to diffusion inference.", + ) + return parser + + +def main(): + ### Parse arguments and config + args = build_parser().parse_args() + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + + +if __name__ == "__main__": + main() diff --git a/bins/vc/preprocess.py b/bins/vc/preprocess.py new file mode 100644 index 00000000..8351eb9e --- /dev/null +++ b/bins/vc/preprocess.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. + """ + types = ["train", "test", "valid"] if "eval" not in dataset else ["test"] + metadata = [] + dataset_output = os.path.join(output_path, dataset) + + for dataset_type in types: + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. 
opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + types = ["train", "test", "valid"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def preprocess(cfg, args): + """Proprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + is_custom_dataset=cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg, dataset_types=["train", "test", "valid"]) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/vc/train.py b/bins/vc/train.py new file mode 100644 index 00000000..ac0be689 --- /dev/null +++ b/bins/vc/train.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
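+#
+# Entry point for training a VC model. egs/vc/_template/run.sh (stage 2) in
+# this patch launches it roughly as follows (paths are illustrative):
+#
+#   CUDA_VISIBLE_DEVICES=0 accelerate launch bins/vc/train.py \
+#       --config egs/vc/TransformerVC/exp_config.json \
+#       --exp_name [YourExptName] \
+#       --log_level info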
+ +import argparse + +import torch + + +from models.vc.transformer.transformer_trainer import TransformerTrainer +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "TransformerVC": TransformerTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", + action="store_true", + help="If specified, to resume from the existing checkpoint.", + ) + parser.add_argument( + "--resume_from_ckpt_path", + type=str, + default="", + help="The specific checkpoint path that you want to resume from.", + ) + parser.add_argument( + "--resume_type", + type=str, + default="", + help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights", + ) + + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None, + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/egs/vc/README.md b/egs/vc/README.md new file mode 100755 index 00000000..3207ecd7 --- /dev/null +++ b/egs/vc/README.md @@ -0,0 +1,34 @@ +# Amphion Singing Voice Conversion (SVC) Recipe + +## Quick Start + +We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html). + +## Supported Model Architectures + +The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder): + +
+
+ +
+
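+The pipeline can be summarized with the following sketch (the function and
+component names are illustrative placeholders, not the actual Amphion APIs):
+
+```python
+# A minimal conceptual sketch of the conversion pipeline described above;
+# every argument is a placeholder for the corresponding Amphion module.
+def convert(source_wav, target_wav, content_extractor, prosody_extractor,
+            speaker_encoder, acoustic_decoder, vocoder):
+    content = content_extractor(source_wav)   # speaker-agnostic content (e.g. Whisper, ContentVec)
+    prosody = prosody_extractor(source_wav)   # F0 and energy
+    speaker = speaker_encoder(target_wav)     # speaker embedding or look-up table ID
+    mel = acoustic_decoder(content, prosody, speaker)
+    return vocoder(mel)                       # waveform synthesis
+```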
+ +Until now, Amphion SVC has supported the following features and models: + +- **Speaker-agnostic Representations**: + - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). + - Prosody Features: F0 and energy. +- **Speaker Embeddings**: + - Speaker Look-Up Table. + - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC. +- **Acoustic Decoders**: + - Diffusion-based models: + - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219). + - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model. + - Transformer-based models: + - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture. + - VAE- and Flow-based models: + - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc). +- **Waveform Synthesizers (Vocoders)**: + - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md). diff --git a/egs/vc/TransformerVC/README.md b/egs/vc/TransformerVC/README.md new file mode 100644 index 00000000..1797e32f --- /dev/null +++ b/egs/vc/TransformerVC/README.md @@ -0,0 +1,164 @@ +# Transformer for Singing Voice Conversion + +This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... 
+ }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/TransformerSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration +Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported: +```json +"model": { + ... + "transformer":{ + // 'conformer' or 'transformer' + "type": "conformer", + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + "dropout":0.1, + } + } +``` +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. + +```bash +sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. Inference/Conversion + +### Pretrained Vocoder Download + +We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`). + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. 
| + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +cd Amphion +sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` + +## Citations + +```bibtex +@inproceedings{transformer, + author = {Ashish Vaswani and + Noam Shazeer and + Niki Parmar and + Jakob Uszkoreit and + Llion Jones and + Aidan N. Gomez and + Lukasz Kaiser and + Illia Polosukhin}, + title = {Attention is All you Need}, + booktitle = {{NIPS}}, + pages = {5998--6008}, + year = {2017} +} +``` \ No newline at end of file diff --git a/egs/vc/TransformerVC/exp_config.json b/egs/vc/TransformerVC/exp_config.json new file mode 100644 index 00000000..4c8e62af --- /dev/null +++ b/egs/vc/TransformerVC/exp_config.json @@ -0,0 +1,109 @@ +{ + "base_config": "config/transformer.json", + "model_type": "TransformerVC", + "dataset": [ + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "libritts": "/home/mingyang/LibriTTS/LibriTTS", + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/vc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "extract_mel": true, + "extract_pitch": false, + "extract_energy": false, + "extract_speaker": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": false, + "extract_wenet_feature": false, + "speaker_dir": "speaker", + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": false, + "use_frame_energy": false, + "use_spkid": false, + "use_spkemb": true, + "use_whisper": true, + "use_contentvec": false, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": false, + "use_wenet": false, + "spkemb_dim": 256, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100, + "use_spkemb": true, + "use_spkid": false + }, + "transformer": { + // 'conformer' or 'transformer' + "type": "conformer", + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels": 512, + "dropout": 0.1, + } + }, + "train": { + "batch_size": 128, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 50 + ], + "keep_last": [ + 5, + -1 + ], + "run_eval": [ + false, + true + ], + "adamw": { + "lr": 4.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + }, + 
"inference": { + + } +} \ No newline at end of file diff --git a/egs/vc/TransformerVC/run.sh b/egs/vc/TransformerVC/run.sh new file mode 120000 index 00000000..f8daac3d --- /dev/null +++ b/egs/vc/TransformerVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/egs/vc/_template/run.sh b/egs/vc/_template/run.sh new file mode 100644 index 00000000..514534cd --- /dev/null +++ b/egs/vc/_template/run.sh @@ -0,0 +1,150 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). + --infer_source_file) shift; infer_source_file=$1 ; shift ;; + --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; + # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". + --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". + --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; + # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. 
See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
+        --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
+
+        --) shift ; break ;;
+        *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vc/preprocess.py \
+        --config $exp_config \
+        --num_workers 4
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    if [ "$resume" = true ]; then
+        echo "Automatically resume from the experimental dir..."
+        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info \
+            --resume
+    else
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info \
+            --resume_from_ckpt_path "$resume_from_ckpt_path" \
+            --resume_type "$resume_type"
+    fi
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        infer_output_dir="$infer_expt_dir/result"
+    fi
+
+    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
+        echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+        exit 1
+    fi
+
+    if [ -z "$infer_source_file" ]; then
+        infer_source=$infer_source_audio_dir
+    fi
+
+    if [ -z "$infer_source_audio_dir" ]; then
+        infer_source=$infer_source_file
+    fi
+
+    if [ -z "$infer_target_speaker" ]; then
+        echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
+        exit 1
+    fi
+
+    if [ -z "$infer_key_shift" ]; then
+        infer_key_shift="autoshift"
+    fi
+
+    if [ -z "$infer_vocoder_dir" ]; then
+        infer_vocoder_dir="$work_dir"/pretrained/bigvgan
+        echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
+ fi + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $infer_source \ + --output_dir $infer_output_dir \ + --log_level debug +fi \ No newline at end of file diff --git a/models/vc/__init__.py b/models/vc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/vc/base/__init__.py b/models/vc/base/__init__.py new file mode 100644 index 00000000..e19ec0dd --- /dev/null +++ b/models/vc/base/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .vc_inference import VCInference +from .vc_trainer import VCTrainer diff --git a/models/vc/base/vc_dataset.py b/models/vc/base/vc_dataset.py new file mode 100644 index 00000000..ca627d0c --- /dev/null +++ b/models/vc/base/vc_dataset.py @@ -0,0 +1,441 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import random +import torch +from torch.nn.utils.rnn import pad_sequence +import json +import os +import numpy as np +import resemblyzer +from utils.data_utils import * +from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema +from processors.content_extractor import ( + ContentvecExtractor, + WhisperExtractor, + WenetExtractor, +) +from models.base.base_dataset import ( + BaseCollator, + BaseDataset, +) +from models.base.new_dataset import BaseTestDataset + +EPS = 1.0e-12 + + +class VCDataset(BaseDataset): + def __init__(self, cfg, dataset, is_valid=False): + BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) + + cfg = self.cfg + + if cfg.model.condition_encoder.use_whisper: + self.whisper_aligner = WhisperExtractor(self.cfg) + self.utt2whisper_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir + ) + + if cfg.model.condition_encoder.use_contentvec: + self.contentvec_aligner = ContentvecExtractor(self.cfg) + self.utt2contentVec_path = load_content_feature_path( + self.metadata, + cfg.preprocess.processed_dir, + cfg.preprocess.contentvec_dir, + ) + + if cfg.model.condition_encoder.use_mert: + self.utt2mert_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir + ) + if cfg.model.condition_encoder.use_wenet: + self.wenet_aligner = WenetExtractor(self.cfg) + self.utt2wenet_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir + ) + + + def __getitem__(self, index): + single_feature = BaseDataset.__getitem__(self, index) + + utt_info = self.metadata[index] + dataset = utt_info["Dataset"] + uid = utt_info["Uid"] + utt = "{}_{}".format(dataset, uid) + + if self.cfg.model.condition_encoder.use_whisper: + assert "target_len" in single_feature.keys() + aligned_whisper_feat = self.whisper_aligner.offline_align( + np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + ) + single_feature["whisper_feat"] = aligned_whisper_feat + + if self.cfg.model.condition_encoder.use_contentvec: + assert "target_len" in single_feature.keys() + aligned_contentvec = self.contentvec_aligner.offline_align( + np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + ) + 
single_feature["contentvec_feat"] = aligned_contentvec + + if self.cfg.model.condition_encoder.use_mert: + assert "target_len" in single_feature.keys() + aligned_mert_feat = align_content_feature_length( + np.load(self.utt2mert_path[utt]), + single_feature["target_len"], + source_hop=self.cfg.preprocess.mert_hop_size, + ) + single_feature["mert_feat"] = aligned_mert_feat + + if self.cfg.model.condition_encoder.use_wenet: + assert "target_len" in single_feature.keys() + aligned_wenet_feat = self.wenet_aligner.offline_align( + np.load(self.utt2wenet_path[utt]), single_feature["target_len"] + ) + single_feature["wenet_feat"] = aligned_wenet_feat + + # print(single_feature.keys()) + # for k, v in single_feature.items(): + # if type(v) in [torch.Tensor, np.ndarray]: + # print(k, v.shape) + # else: + # print(k, v) + # exit() + + return self.clip_if_too_long(single_feature) + + def __len__(self): + return len(self.metadata) + + def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812): + """ + ending_ts: to avoid invalid whisper features for over 30s audios + 2812 = 30 * 24000 // 256 + """ + ts = max(feature_seq_len - max_seq_len, 0) + ts = min(ts, ending_ts - max_seq_len) + + start = random.randint(0, ts) + end = start + max_seq_len + return start, end + + def clip_if_too_long(self, sample, max_seq_len=512): + """ + sample : + { + 'spk_id': (1,), + 'target_len': int + 'mel': (seq_len, dim), + 'frame_pitch': (seq_len,) + 'frame_energy': (seq_len,) + 'content_vector_feat': (seq_len, dim) + } + """ + + if sample["target_len"] <= max_seq_len: + return sample + + start, end = self.random_select(sample["target_len"], max_seq_len) + sample["target_len"] = end - start + + for k in sample.keys(): + if k == "audio": + # audio should be clipped in hop_size scale + sample[k] = sample[k][ + start + * self.cfg.preprocess.hop_size : end + * self.cfg.preprocess.hop_size + ] + elif k == "audio_len": + sample[k] = (end - start) * self.cfg.preprocess.hop_size + elif k not in ["spk_id", "target_len", "spkemb"]: + sample[k] = sample[k][start:end] + + return sample + + +class VCCollator(BaseCollator): + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, cfg): + BaseCollator.__init__(self, cfg) + + def __call__(self, batch): + parsed_batch_features = BaseCollator.__call__(self, batch) + return parsed_batch_features + + +class VCTestDataset(BaseTestDataset): + def __init__(self, args, cfg, infer_type): + BaseTestDataset.__init__(self, args, cfg, infer_type) + self.metadata = self.get_metadata() + + self.target = args.target + self.cfg = cfg + self.trans_key = args.trans_key + + + self.target_dataset = cfg.dataset[0] + if cfg.preprocess.mel_min_max_norm: + self.target_mel_extrema = load_mel_extrema( + cfg.preprocess, self.target_dataset + ) + self.target_mel_extrema = torch.as_tensor( + self.target_mel_extrema[0] + ), torch.as_tensor(self.target_mel_extrema[1]) + + ######### Load source acoustic features ######### + if cfg.preprocess.use_spkid: + spk2id_path = os.path.join(args.acoustics_dir, cfg.preprocess.spk2id) + # utt2sp_path = os.path.join(self.data_root, cfg.preprocess.utt2spk) + + with open(spk2id_path, "r") as f: + self.spk2id = json.load(f) + # print("self.spk2id", self.spk2id) + + if cfg.preprocess.use_uv: + self.utt2uv_path = { + f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( + cfg.preprocess.processed_dir, + utt_info["Dataset"], + cfg.preprocess.uv_dir, + utt_info["Uid"] + ".npy", + ) + for utt_info in self.metadata + } + + if 
cfg.preprocess.use_frame_pitch: + self.utt2frame_pitch_path = { + f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( + cfg.preprocess.processed_dir, + utt_info["Dataset"], + cfg.preprocess.pitch_dir, + utt_info["Uid"] + ".npy", + ) + for utt_info in self.metadata + } + + # Target F0 median + target_f0_statistics_path = os.path.join( + cfg.preprocess.processed_dir, + self.target_dataset, + cfg.preprocess.pitch_dir, + "statistics.json", + ) + self.target_pitch_median = json.load(open(target_f0_statistics_path, "r"))[ + f"{self.target_dataset}_{self.target_singer}" + ]["voiced_positions"]["median"] + + # Source F0 median (if infer from file) + if infer_type == "from_file": + source_audio_name = cfg.inference.source_audio_name + source_f0_statistics_path = os.path.join( + cfg.preprocess.processed_dir, + source_audio_name, + cfg.preprocess.pitch_dir, + "statistics.json", + ) + self.source_pitch_median = json.load( + open(source_f0_statistics_path, "r") + )[f"{source_audio_name}_{source_audio_name}"]["voiced_positions"][ + "median" + ] + else: + self.source_pitch_median = None + + if cfg.preprocess.use_frame_energy: + self.utt2frame_energy_path = { + f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( + cfg.preprocess.processed_dir, + utt_info["Dataset"], + cfg.preprocess.energy_dir, + utt_info["Uid"] + ".npy", + ) + for utt_info in self.metadata + } + + if cfg.preprocess.use_mel: + self.utt2mel_path = { + f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( + cfg.preprocess.processed_dir, + utt_info["Dataset"], + cfg.preprocess.mel_dir, + utt_info["Uid"] + ".npy", + ) + for utt_info in self.metadata + } + + ######### Load source content features' path ######### + if cfg.model.condition_encoder.use_whisper: + self.whisper_aligner = WhisperExtractor(cfg) + self.utt2whisper_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir + ) + + if cfg.model.condition_encoder.use_contentvec: + self.contentvec_aligner = ContentvecExtractor(cfg) + self.utt2contentVec_path = load_content_feature_path( + self.metadata, + cfg.preprocess.processed_dir, + cfg.preprocess.contentvec_dir, + ) + + if cfg.model.condition_encoder.use_mert: + self.utt2mert_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir + ) + if cfg.model.condition_encoder.use_wenet: + self.wenet_aligner = WenetExtractor(cfg) + self.utt2wenet_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir + ) + + def __getitem__(self, index): + single_feature = {} + + utt_info = self.metadata[index] + dataset = utt_info["Dataset"] + uid = utt_info["Uid"] + utt = "{}_{}".format(dataset, uid) + + source_dataset = self.metadata[index]["Dataset"] + + if self.cfg.preprocess.use_spkid: + single_feature["spk_id"] = np.array( + [self.spk2id[f"{self.target_dataset}_{self.target_singer}"]], + dtype=np.int32, + ) + + if self.cfg.preprocess.use_spkemb: + voice_encoder = resemblyzer.VoiceEncoder('cpu', verbose=False) + target_wav = resemblyzer.preprocess_wav(self.target) + single_feature["spkemb"] = voice_encoder.embed_utterance(target_wav) + + ######### Get Acoustic Features Item ######### + if self.cfg.preprocess.use_mel: + mel = np.load(self.utt2mel_path[utt]) + assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T] + if self.cfg.preprocess.use_min_max_norm_mel: + # mel norm + mel = cal_normalized_mel(mel, source_dataset, self.cfg.preprocess) + + if "target_len" not in 
single_feature.keys(): + single_feature["target_len"] = mel.shape[1] + single_feature["mel"] = mel.T # [T, n_mels] + + if self.cfg.preprocess.use_frame_pitch: + frame_pitch_path = self.utt2frame_pitch_path[utt] + frame_pitch = np.load(frame_pitch_path) + + if self.trans_key: + try: + self.trans_key = int(self.trans_key) + except: + pass + if type(self.trans_key) == int: + frame_pitch = transpose_key(frame_pitch, self.trans_key) + elif self.trans_key: + assert self.target_singer + + frame_pitch = pitch_shift_to_target( + frame_pitch, self.target_pitch_median, self.source_pitch_median + ) + + if "target_len" not in single_feature.keys(): + single_feature["target_len"] = len(frame_pitch) + aligned_frame_pitch = align_length( + frame_pitch, single_feature["target_len"] + ) + single_feature["frame_pitch"] = aligned_frame_pitch + + if self.cfg.preprocess.use_uv: + frame_uv_path = self.utt2uv_path[utt] + frame_uv = np.load(frame_uv_path) + aligned_frame_uv = align_length(frame_uv, single_feature["target_len"]) + aligned_frame_uv = [ + 0 if frame_uv else 1 for frame_uv in aligned_frame_uv + ] + aligned_frame_uv = np.array(aligned_frame_uv) + single_feature["frame_uv"] = aligned_frame_uv + + if self.cfg.preprocess.use_frame_energy: + frame_energy_path = self.utt2frame_energy_path[utt] + frame_energy = np.load(frame_energy_path) + if "target_len" not in single_feature.keys(): + single_feature["target_len"] = len(frame_energy) + aligned_frame_energy = align_length( + frame_energy, single_feature["target_len"] + ) + single_feature["frame_energy"] = aligned_frame_energy + + ######### Get Content Features Item ######### + if self.cfg.model.condition_encoder.use_whisper: + assert "target_len" in single_feature.keys() + aligned_whisper_feat = self.whisper_aligner.offline_align( + np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + ) + single_feature["whisper_feat"] = aligned_whisper_feat + + if self.cfg.model.condition_encoder.use_contentvec: + assert "target_len" in single_feature.keys() + aligned_contentvec = self.contentvec_aligner.offline_align( + np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + ) + single_feature["contentvec_feat"] = aligned_contentvec + + if self.cfg.model.condition_encoder.use_mert: + assert "target_len" in single_feature.keys() + aligned_mert_feat = align_content_feature_length( + np.load(self.utt2mert_path[utt]), + single_feature["target_len"], + source_hop=self.cfg.preprocess.mert_hop_size, + ) + single_feature["mert_feat"] = aligned_mert_feat + + if self.cfg.model.condition_encoder.use_wenet: + assert "target_len" in single_feature.keys() + aligned_wenet_feat = self.wenet_aligner.offline_align( + np.load(self.utt2wenet_path[utt]), single_feature["target_len"] + ) + single_feature["wenet_feat"] = aligned_wenet_feat + + return single_feature + + def __len__(self): + return len(self.metadata) + + +class VCTestCollator: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, batch): + packed_batch_features = dict() + + # mel: [b, T, n_mels] + # frame_pitch, frame_energy: [1, T] + # target_len: [1] + # spk_id: [b, 1] + # mask: [b, T, 1] + + for key in batch[0].keys(): + if key == "target_len": + packed_batch_features["target_len"] = torch.LongTensor( + [b["target_len"] for b in batch] + ) + masks = [ + torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch + ] + packed_batch_features["mask"] = pad_sequence( + masks, batch_first=True, 
padding_value=0 + ) + else: + values = [torch.from_numpy(b[key]) for b in batch] + packed_batch_features[key] = pad_sequence( + values, batch_first=True, padding_value=0 + ) + + return packed_batch_features diff --git a/models/vc/base/vc_inference.py b/models/vc/base/vc_inference.py new file mode 100644 index 00000000..a97b36ed --- /dev/null +++ b/models/vc/base/vc_inference.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from models.base.new_inference import BaseInference +from models.vc.base.vc_dataset import VCTestCollator, VCTestDataset + + +class VCInference(BaseInference): + def __init__(self, args=None, cfg=None, infer_type="from_dataset"): + BaseInference.__init__(self, args, cfg, infer_type) + + def _build_test_dataset(self): + return VCTestDataset, VCTestCollator diff --git a/models/vc/base/vc_trainer.py b/models/vc/base/vc_trainer.py new file mode 100644 index 00000000..48800284 --- /dev/null +++ b/models/vc/base/vc_trainer.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os + +import torch +import torch.nn as nn + +from models.base.new_trainer import BaseTrainer +from models.vc.base.vc_dataset import VCCollator, VCDataset + + +class VCTrainer(BaseTrainer): + r"""The base trainer for all SVC models. It inherits from BaseTrainer and implements + ``build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this + class, and implement ``_build_model``, ``_forward_step``. + """ + + def __init__(self, args=None, cfg=None): + self.args = args + self.cfg = cfg + + self._init_accelerator() + + # Only for SVC tasks + with self.accelerator.main_process_first(): + self.singers = self._build_singer_lut() + + # Super init + BaseTrainer.__init__(self, args, cfg) + + # Only for SVC tasks + self.task_type = "SVC" + self.logger.info("Task type: {}".format(self.task_type)) + + ### Following are methods only for SVC tasks ### + # TODO: LEGACY CODE, NEED TO BE REFACTORED + def _build_dataset(self): + return VCDataset, VCCollator + + @staticmethod + def _build_criterion(): + criterion = nn.MSELoss(reduction="none") + return criterion + + @staticmethod + def _compute_loss(criterion, y_pred, y_gt, loss_mask): + """ + Args: + criterion: MSELoss(reduction='none') + y_pred, y_gt: (bs, seq_len, D) + loss_mask: (bs, seq_len, 1) + Returns: + loss: Tensor of shape [] + """ + + # (bs, seq_len, D) + loss = criterion(y_pred, y_gt) + # expand loss_mask to (bs, seq_len, D) + loss_mask = loss_mask.repeat(1, 1, loss.shape[-1]) + + loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask) + return loss + + def _save_auxiliary_states(self): + """ + To save the singer's look-up table in the checkpoint saving path + """ + with open( + os.path.join(self.tmp_checkpoint_save_path, self.cfg.preprocess.spk2id), "w" + ) as f: + json.dump(self.singers, f, indent=4, ensure_ascii=False) + + def _build_singer_lut(self): + resumed_singer_path = None + if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "": + resumed_singer_path = os.path.join( + self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id + ) + if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)): + resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id) + + if resumed_singer_path: 
+ with open(resumed_singer_path, "r") as f: + singers = json.load(f) + else: + singers = dict() + + for dataset in self.cfg.dataset: + singer_lut_path = os.path.join( + self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id + ) + with open(singer_lut_path, "r") as singer_lut_path: + singer_lut = json.load(singer_lut_path) + for singer in singer_lut.keys(): + if singer not in singers: + singers[singer] = len(singers) + + with open( + os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w" + ) as singer_file: + json.dump(singers, singer_file, indent=4, ensure_ascii=False) + print( + "singers have been dumped to {}".format( + os.path.join(self.exp_dir, self.cfg.preprocess.spk2id) + ) + ) + return singers diff --git a/models/vc/transformer/__init__.py b/models/vc/transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/vc/transformer/conformer.py b/models/vc/transformer/conformer.py new file mode 100644 index 00000000..5e48019c --- /dev/null +++ b/models/vc/transformer/conformer.py @@ -0,0 +1,405 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import torch +import numpy as np +import torch.nn as nn +from utils.util import convert_pad_shape + + +class BaseModule(torch.nn.Module): + def __init__(self): + super(BaseModule, self).__init__() + + @property + def nparams(self): + """ + Returns number of trainable parameters of the module. + """ + num_params = 0 + for name, param in self.named_parameters(): + if param.requires_grad: + num_params += np.prod(param.detach().cpu().numpy().shape) + return num_params + + def relocate_input(self, x: list): + """ + Relocates provided tensors to the same device set for the module. 
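+        Tensors that are already on the correct device are left unchanged.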
+ """ + device = next(self.parameters()).device + for i in range(len(x)): + if isinstance(x[i], torch.Tensor) and x[i].device != device: + x[i] = x[i].to(device) + return x + + +class LayerNorm(BaseModule): + def __init__(self, channels, eps=1e-4): + super(LayerNorm, self).__init__() + self.channels = channels + self.eps = eps + + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean) ** 2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class ConvReluNorm(BaseModule): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + eps=1e-5, + ): + super(ConvReluNorm, self).__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + self.eps = eps + + self.conv_layers = torch.nn.ModuleList() + self.conv_layers.append( + torch.nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.relu_drop = torch.nn.Sequential( + torch.nn.ReLU(), torch.nn.Dropout(p_dropout) + ) + for _ in range(n_layers - 1): + self.conv_layers.append( + torch.nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.instance_norm(x, x_mask) + x = self.relu_drop(x) + x = self.proj(x) + return x * x_mask + + def instance_norm(self, x, mask, return_mean_std=False): + mean, std = self.calc_mean_std(x, mask) + x = (x - mean) / std + if return_mean_std: + return x, mean, std + else: + return x + + def calc_mean_std(self, x, mask=None): + x = x * mask + B, C = x.shape[:2] + mn = x.view(B, C, -1).mean(-1) + sd = (x.view(B, C, -1).var(-1) + self.eps).sqrt() + mn = mn.view(B, C, *((len(x.shape) - 2) * [1])) + sd = sd.view(B, C, *((len(x.shape) - 2) * [1])) + return mn, sd + + +class MultiHeadAttention(BaseModule): + def __init__( + self, + channels, + out_channels, + n_heads, + window_size=None, + heads_share=True, + p_dropout=0.0, + proximal_bias=False, + proximal_init=False, + ): + super(MultiHeadAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = torch.nn.Conv1d(channels, channels, 1) + self.conv_k = torch.nn.Conv1d(channels, channels, 1) + self.conv_v = torch.nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = torch.nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = torch.nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.conv_o = torch.nn.Conv1d(channels, 
out_channels, 1) + self.drop = torch.nn.Dropout(p_dropout) + + torch.nn.init.xavier_uniform_(self.conv_q.weight) + torch.nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + torch.nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position(rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + p_attn = torch.nn.functional.softmax(scores, dim=-1) + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = torch.nn.functional.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + ) + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + 
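+        """
+        Inverse transform of _relative_position_to_absolute_position: converts absolute
+        attention weights of shape [b, h, l, l] into relative-position weights of shape
+        [b, h, l, 2*l - 1] by padding and reshaping.
+        """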
batch, heads, length, _ = x.size() + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]) + ) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(BaseModule): + def __init__( + self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0 + ): + super(FFN, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.conv_1 = torch.nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.conv_2 = torch.nn.Conv1d( + filter_channels, out_channels, kernel_size, padding=kernel_size // 2 + ) + self.drop = torch.nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + return x * x_mask + + +class Encoder(BaseModule): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads=2, + n_layers=6, + kernel_size=3, + p_dropout=0.1, + window_size=4, + **kwargs + ): + super(Encoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = torch.nn.Dropout(p_dropout) + self.attn_layers = torch.nn.ModuleList() + self.norm_layers_1 = torch.nn.ModuleList() + self.ffn_layers = torch.nn.ModuleList() + self.norm_layers_2 = torch.nn.ModuleList() + for _ in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + window_size=window_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + for i in range(self.n_layers): + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Conformer(BaseModule): + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.n_heads = self.cfg.n_heads + self.n_layers = self.cfg.n_layers + self.hidden_channels = self.cfg.input_dim + self.filter_channels = self.cfg.filter_channels + self.output_dim = self.cfg.output_dim + self.dropout = self.cfg.dropout + + self.conformer_encoder = Encoder( + self.hidden_channels, + self.filter_channels, + n_heads=self.n_heads, + n_layers=self.n_layers, + kernel_size=3, + p_dropout=self.dropout, + window_size=4, + ) + self.projection = nn.Conv1d(self.hidden_channels, self.output_dim, 1) + + def forward(self, x, x_mask): + """ + Args: + x: (N, seq_len, input_dim) + Returns: + output: (N, seq_len, 
output_dim) + """ + # (N, seq_len, d_model) + x = x.transpose(1, 2) + x_mask = x_mask.transpose(1, 2) + output = self.conformer_encoder(x, x_mask) + # (N, seq_len, output_dim) + output = self.projection(output) + output = output.transpose(1, 2) + return output diff --git a/models/vc/transformer/transformer.py b/models/vc/transformer/transformer.py new file mode 100644 index 00000000..fd3cdb6c --- /dev/null +++ b/models/vc/transformer/transformer.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import torch +import torch.nn as nn +from torch.nn import TransformerEncoder, TransformerEncoderLayer + + +class Transformer(nn.Module): + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + + dropout = self.cfg.dropout + nhead = self.cfg.n_heads + nlayers = self.cfg.n_layers + input_dim = self.cfg.input_dim + output_dim = self.cfg.output_dim + + d_model = input_dim + self.pos_encoder = PositionalEncoding(d_model, dropout) + encoder_layers = TransformerEncoderLayer( + d_model, nhead, dropout=dropout, batch_first=True + ) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + + self.output_mlp = nn.Linear(d_model, output_dim) + + def forward(self, x, mask=None): + """ + Args: + x: (N, seq_len, input_dim) + Returns: + output: (N, seq_len, output_dim) + """ + # (N, seq_len, d_model) + src = self.pos_encoder(x) + # model_stats["pos_embedding"] = x + # (N, seq_len, d_model) + output = self.transformer_encoder(src) + # (N, seq_len, output_dim) + output = self.output_mlp(output) + return output + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + + position = torch.arange(max_len).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model) + ) + + # Assume that x is (seq_len, N, d) + # pe = torch.zeros(max_len, 1, d_model) + # pe[:, 0, 0::2] = torch.sin(position * div_term) + # pe[:, 0, 1::2] = torch.cos(position * div_term) + + # Assume that x in (N, seq_len, d) + pe = torch.zeros(1, max_len, d_model) + pe[0, :, 0::2] = torch.sin(position * div_term) + pe[0, :, 1::2] = torch.cos(position * div_term) + + self.register_buffer("pe", pe) + + def forward(self, x): + """ + Args: + x: Tensor, shape [N, seq_len, d] + """ + # Old: Assume that x is (seq_len, N, d), and self.pe is (max_len, 1, d_model) + # x = x + self.pe[: x.size(0)] + + # Now: self.pe is (1, max_len, d) + x = x + self.pe[:, : x.size(1), :] + + return self.dropout(x) diff --git a/models/vc/transformer/transformer_inference.py b/models/vc/transformer/transformer_inference.py new file mode 100644 index 00000000..f1c8f943 --- /dev/null +++ b/models/vc/transformer/transformer_inference.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
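+#
+# Inference wrapper for the Transformer/Conformer VC acoustic models: it rebuilds the
+# condition encoder and acoustic mapper, then maps the conditioning features to mel
+# predictions for each test batch.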
+ +import os +import time +import numpy as np +import torch +from tqdm import tqdm +import torch.nn as nn +from collections import OrderedDict + +from models.vc.base import VCInference +from modules.encoder.condition_encoder import ConditionEncoder +from models.vc.transformer.transformer import Transformer +from models.vc.transformer.conformer import Conformer + + +class TransformerInference(VCInference): + def __init__(self, args=None, cfg=None, infer_type="from_dataset"): + VCInference.__init__(self, args, cfg, infer_type) + + def _build_model(self): + self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min + self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max + self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder) + if self.cfg.model.transformer.type == "transformer": + self.acoustic_mapper = Transformer(self.cfg.model.transformer) + elif self.cfg.model.transformer.type == "conformer": + self.acoustic_mapper = Conformer(self.cfg.model.transformer) + else: + raise NotImplementedError + model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper]) + return model + + def _inference_each_batch(self, batch_data): + device = self.accelerator.device + for k, v in batch_data.items(): + batch_data[k] = v.to(device) + + condition = self.condition_encoder(batch_data) + y_pred = self.acoustic_mapper(condition, batch_data["mask"]) + + return y_pred diff --git a/models/vc/transformer/transformer_trainer.py b/models/vc/transformer/transformer_trainer.py new file mode 100644 index 00000000..24f90572 --- /dev/null +++ b/models/vc/transformer/transformer_trainer.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
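+#
+# Trainer for the Transformer/Conformer VC acoustic models: optimizes a masked L1 loss
+# plus a masked SSIM loss between the predicted and ground-truth mel spectrograms.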
+ +import torch + +from models.vc.base import VCTrainer +from modules.encoder.condition_encoder import ConditionEncoder +from models.vc.transformer.transformer import Transformer +from models.vc.transformer.conformer import Conformer +from utils.ssim import SSIM + + +class TransformerTrainer(VCTrainer): + def __init__(self, args, cfg): + VCTrainer.__init__(self, args, cfg) + self.ssim_loss = SSIM() + + def _build_model(self): + self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min + self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max + self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder) + if self.cfg.model.transformer.type == "transformer": + self.acoustic_mapper = Transformer(self.cfg.model.transformer) + elif self.cfg.model.transformer.type == "conformer": + self.acoustic_mapper = Conformer(self.cfg.model.transformer) + else: + raise NotImplementedError + model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper]) + return model + + def _forward_step(self, batch): + total_loss = 0 + device = self.accelerator.device + mel = batch["mel"] + mask = batch["mask"] + + condition = self.condition_encoder(batch) + mel_pred = self.acoustic_mapper(condition, mask) + + l1_loss = torch.sum(torch.abs(mel_pred - mel) * batch["mask"]) / torch.sum( + batch["mask"] + ) + self._check_nan(l1_loss, mel_pred, mel) + total_loss += l1_loss + ssim_loss = self.ssim_loss(mel_pred, mel) + ssim_loss = torch.sum(ssim_loss * batch["mask"]) / torch.sum(batch["mask"]) + self._check_nan(ssim_loss, mel_pred, mel) + total_loss += ssim_loss + + return total_loss From 0787a6ab06a8286bde0e0667cd798f45720a8430 Mon Sep 17 00:00:00 2001 From: mingyang Date: Fri, 5 Jan 2024 17:39:41 +0800 Subject: [PATCH 03/12] Fixed load vocoder checkpoint issue --- .gitignore | 6 +++++- bins/vc/inference.py | 2 +- egs/vc/TransformerVC/exp_config.json | 2 +- egs/vc/_template/run.sh | 10 +++++----- models/base/new_inference.py | 7 +++++-- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 67f8d188..ab94a18b 100644 --- a/.gitignore +++ b/.gitignore @@ -61,4 +61,8 @@ logs source_audio result conversion_results -get_available_gpu.py \ No newline at end of file +get_available_gpu.py + +#slurm files +slurm.sh +*.log \ No newline at end of file diff --git a/bins/vc/inference.py b/bins/vc/inference.py index fce97e38..f5bf40ee 100644 --- a/bins/vc/inference.py +++ b/bins/vc/inference.py @@ -62,7 +62,7 @@ def prepare_for_audio_file(args, cfg, num_workers=1): def merge_for_audio_segments(audio_files, args, cfg): audio_name = cfg.inference.source_audio_name - target_singer_name = args.target_singer + target_singer_name = os.path.basename(args.target).split('.')[0] merge_segments_encodec( wav_files=audio_files, diff --git a/egs/vc/TransformerVC/exp_config.json b/egs/vc/TransformerVC/exp_config.json index 4c8e62af..5ac9916d 100644 --- a/egs/vc/TransformerVC/exp_config.json +++ b/egs/vc/TransformerVC/exp_config.json @@ -75,7 +75,7 @@ "gradient_accumulation_step": 1, "max_epoch": -1, // -1 means no limit "save_checkpoint_stride": [ - 50, + 5, 50 ], "keep_last": [ diff --git a/egs/vc/_template/run.sh b/egs/vc/_template/run.sh index 514534cd..64e7917e 100644 --- a/egs/vc/_template/run.sh +++ b/egs/vc/_template/run.sh @@ -40,8 +40,8 @@ while true; do # [Only for Inference] The inference source (can be a json file or a dir). 
For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). --infer_source_file) shift; infer_source_file=$1 ; shift ;; --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; - # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". - --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] Specify the audio file of the target speaker you want to convert into. + --infer_target_audio) shift; infer_target=$1 ; shift ;; # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders. @@ -125,7 +125,7 @@ if [ $running_stage -eq 3 ]; then fi if [ -z "$infer_target_speaker" ]; then - echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1"" + echo "[Error] Please specify the target audio file." exit 1 fi @@ -138,11 +138,11 @@ if [ $running_stage -eq 3 ]; then echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint." 
fi - CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \ + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vc/inference.py \ --config $exp_config \ --acoustics_dir $infer_expt_dir \ --vocoder_dir $infer_vocoder_dir \ - --target_singer $infer_target_speaker \ + --target $infer_target \ --trans_key $infer_key_shift \ --source $infer_source \ --output_dir $infer_output_dir \ diff --git a/models/base/new_inference.py b/models/base/new_inference.py index 01dce86d..a2ffec20 100644 --- a/models/base/new_inference.py +++ b/models/base/new_inference.py @@ -146,11 +146,14 @@ def inference(self): torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt")) j += 1 - vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir) + # vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir) + vocoder_cfg = load_config( + os.path.join(self.args.vocoder_dir, "args.json"), lowercase=True + ) res = synthesis( cfg=vocoder_cfg, - vocoder_weight_file=vocoder_ckpt, + vocoder_weight_file=self.args.vocoder_dir, n_samples=None, pred=[ torch.load( From 63ccb34b8c804c3f1d2095c1060b57b2f5c73f38 Mon Sep 17 00:00:00 2001 From: mingyang Date: Sat, 13 Jan 2024 13:02:17 +0800 Subject: [PATCH 04/12] Change input to Hubert token and normalized pitch --- bins/vc/preprocess.py | 6 +- config/transformer.json | 8 +- egs/vc/README.md | 4 +- egs/vc/TransformerVC/exp_config.json | 20 +++- models/vc/base/vc_dataset.py | 101 ++++++++++--------- models/vc/transformer/transformer_trainer.py | 4 +- modules/encoder/condition_encoder.py | 26 ++++- processors/acoustic_extractor.py | 14 +-- processors/content_extractor.py | 89 +++++++++++++++- 9 files changed, 204 insertions(+), 68 deletions(-) diff --git a/bins/vc/preprocess.py b/bins/vc/preprocess.py index 8351eb9e..641ff759 100644 --- a/bins/vc/preprocess.py +++ b/bins/vc/preprocess.py @@ -76,10 +76,7 @@ def preprocess(cfg, args): # Specify the output root path to save the processed data output_path = cfg.preprocess.processed_dir os.makedirs(output_path, exist_ok=True) - # Prepare the content features - for dataset in cfg.dataset: - print("Extracting content features for {}...".format(dataset)) - extract_content_features(dataset, output_path, cfg, args.num_workers) + ## Split train and test sets for dataset in cfg.dataset: print("Preprocess {}...".format(dataset)) @@ -88,6 +85,7 @@ def preprocess(cfg, args): cfg.dataset_path[dataset], output_path, cfg.preprocess, + cfg.task_type, is_custom_dataset=cfg.use_custom_dataset, ) diff --git a/config/transformer.json b/config/transformer.json index be3514e9..60c16127 100644 --- a/config/transformer.json +++ b/config/transformer.json @@ -24,6 +24,8 @@ "wenet_sample_rate": 16000, "extract_mert_feature": false, "mert_sample_rate": 16000, + "extract_hubert_feature": false, + "hubert_sample_rate": 16000, // Default config for whisper "whisper_frameshift": 0.01, "whisper_downsample_rate": 2, @@ -39,6 +41,8 @@ "wenet_frameshift": 0.01, // wenetspeech is 4, gigaspeech is 6 "wenet_downsample_rate": 4, + // Default config for hubert + "hubert_frameshift": 0.02, // Default config "n_mel": 100, "win_size": 1024, @@ -65,6 +69,7 @@ "contentvec_dir": "contentvec", "wenet_dir": "wenet", "mert_dir": "mert", + "hubert_dir": "hubert", // Extract content features using dataloader "pin_memory": true, "num_workers": 8, @@ -97,9 +102,10 @@ "n_bins_loudness": 256, "output_loudness_dim": 384, "use_whisper": false, - "use_contentvec": true, + "use_contentvec": false, "use_wenet": false, "use_mert": false, 
+ "use_hubert": false, "whisper_dim": 1024, "contentvec_dim": 256, "mert_dim": 256, diff --git a/egs/vc/README.md b/egs/vc/README.md index 3207ecd7..f8be4303 100755 --- a/egs/vc/README.md +++ b/egs/vc/README.md @@ -1,8 +1,8 @@ -# Amphion Singing Voice Conversion (SVC) Recipe +# Amphion Voice Conversion (VC) Recipe ## Quick Start -We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html). +We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge VC model. Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html). ## Supported Model Architectures diff --git a/egs/vc/TransformerVC/exp_config.json b/egs/vc/TransformerVC/exp_config.json index 5ac9916d..ce9f1149 100644 --- a/egs/vc/TransformerVC/exp_config.json +++ b/egs/vc/TransformerVC/exp_config.json @@ -15,16 +15,21 @@ "processed_dir": "data", // Config for features extraction "extract_mel": true, - "extract_pitch": false, + "extract_pitch": true, + "extract_uv": false, + "extract_duration": false, "extract_energy": false, "extract_speaker": true, - "extract_whisper_feature": true, + "extract_whisper_feature": false, "extract_contentvec_feature": false, "extract_wenet_feature": false, + "extract_hubert_feature": true, "speaker_dir": "speaker", "whisper_batch_size": 30, // decrease it if your GPU is out of memory "contentvec_batch_size": 1, // Fill in the content-based pretrained model's path + "hubert_model_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3.pt", + "hubert_km_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin", "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", @@ -33,22 +38,25 @@ // Config for features usage "use_mel": true, "use_min_max_norm_mel": true, - "use_frame_pitch": false, + "use_frame_pitch": true, "use_frame_energy": false, + "use_uv": false, "use_spkid": false, "use_spkemb": true, - "use_whisper": true, + "use_whisper": false, "use_contentvec": false, "use_wenet": false, + "use_hubert": true, "n_mel": 100, "sample_rate": 24000 }, "model": { "condition_encoder": { // Config for features usage - "use_whisper": true, + "use_whisper": false, "use_contentvec": false, "use_wenet": false, + "use_hubert": true, "spkemb_dim": 256, "whisper_dim": 1024, "contentvec_dim": 256, @@ -56,6 +64,8 @@ "use_singer_encoder": false, "pitch_min": 50, "pitch_max": 1100, + "f0_min": 0, + "f0_max": 1, "use_spkemb": true, "use_spkid": false }, diff --git a/models/vc/base/vc_dataset.py b/models/vc/base/vc_dataset.py index ca627d0c..c2ab1972 100644 --- a/models/vc/base/vc_dataset.py +++ b/models/vc/base/vc_dataset.py @@ -9,6 +9,7 @@ import json import os import numpy as np +from sklearn.preprocessing import MinMaxScaler import resemblyzer from utils.data_utils import * from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema @@ -16,6 +17,7 @@ ContentvecExtractor, WhisperExtractor, WenetExtractor, + HubertExtractor, ) from models.base.base_dataset import ( 
BaseCollator, @@ -55,7 +57,11 @@ def __init__(self, cfg, dataset, is_valid=False): self.utt2wenet_path = load_content_feature_path( self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir ) - + if cfg.model.condition_encoder.use_hubert: + self.hubert_aligner = HubertExtractor(self.cfg) + self.utt2hubert_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.hubert_dir + ) def __getitem__(self, index): single_feature = BaseDataset.__getitem__(self, index) @@ -65,6 +71,15 @@ def __getitem__(self, index): uid = utt_info["Uid"] utt = "{}_{}".format(dataset, uid) + if self.cfg.preprocess.use_frame_pitch: + assert "frame_pitch" in single_feature.keys() + scaler = MinMaxScaler() + scaler.fit(single_feature["frame_pitch"].reshape(-1, 1)) + single_feature["frame_pitch"] = scaler.transform( + single_feature["frame_pitch"].reshape(-1, 1) + ) + single_feature["frame_pitch"] = single_feature["frame_pitch"].reshape(-1) + if self.cfg.model.condition_encoder.use_whisper: assert "target_len" in single_feature.keys() aligned_whisper_feat = self.whisper_aligner.offline_align( @@ -95,6 +110,13 @@ def __getitem__(self, index): ) single_feature["wenet_feat"] = aligned_wenet_feat + if self.cfg.model.condition_encoder.use_hubert: + assert "target_len" in single_feature.keys() + aligned_hubert_feat = self.hubert_aligner.offline_align( + np.load(self.utt2hubert_path[utt]), single_feature["target_len"] + ) + single_feature["hubert_feat"] = aligned_hubert_feat.astype(np.int32) + # print(single_feature.keys()) # for k, v in single_feature.items(): # if type(v) in [torch.Tensor, np.ndarray]: @@ -175,7 +197,6 @@ def __init__(self, args, cfg, infer_type): self.cfg = cfg self.trans_key = args.trans_key - self.target_dataset = cfg.dataset[0] if cfg.preprocess.mel_min_max_norm: self.target_mel_extrema = load_mel_extrema( @@ -194,6 +215,20 @@ def __init__(self, args, cfg, infer_type): self.spk2id = json.load(f) # print("self.spk2id", self.spk2id) + if cfg.preprocess.use_spkemb: + self.utt2spk_path = {} + for utt_info in self.metadata: + dataset = utt_info["Dataset"] + uid = utt_info["Uid"] + utt = "{}_{}".format(dataset, uid) + + self.utt2spk_path[utt] = os.path.join( + cfg.preprocess.processed_dir, + dataset, + cfg.preprocess.speaker_dir, + uid + ".npy", + ) + if cfg.preprocess.use_uv: self.utt2uv_path = { f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( @@ -216,34 +251,6 @@ def __init__(self, args, cfg, infer_type): for utt_info in self.metadata } - # Target F0 median - target_f0_statistics_path = os.path.join( - cfg.preprocess.processed_dir, - self.target_dataset, - cfg.preprocess.pitch_dir, - "statistics.json", - ) - self.target_pitch_median = json.load(open(target_f0_statistics_path, "r"))[ - f"{self.target_dataset}_{self.target_singer}" - ]["voiced_positions"]["median"] - - # Source F0 median (if infer from file) - if infer_type == "from_file": - source_audio_name = cfg.inference.source_audio_name - source_f0_statistics_path = os.path.join( - cfg.preprocess.processed_dir, - source_audio_name, - cfg.preprocess.pitch_dir, - "statistics.json", - ) - self.source_pitch_median = json.load( - open(source_f0_statistics_path, "r") - )[f"{source_audio_name}_{source_audio_name}"]["voiced_positions"][ - "median" - ] - else: - self.source_pitch_median = None - if cfg.preprocess.use_frame_energy: self.utt2frame_energy_path = { f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join( @@ -290,6 +297,11 @@ def __init__(self, args, cfg, infer_type): self.utt2wenet_path = 
load_content_feature_path( self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir ) + if cfg.model.condition_encoder.use_hubert: + self.hubert_aligner = HubertExtractor(cfg) + self.utt2hubert_path = load_content_feature_path( + self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.hubert_dir + ) def __getitem__(self, index): single_feature = {} @@ -308,7 +320,7 @@ def __getitem__(self, index): ) if self.cfg.preprocess.use_spkemb: - voice_encoder = resemblyzer.VoiceEncoder('cpu', verbose=False) + voice_encoder = resemblyzer.VoiceEncoder("cpu", verbose=False) target_wav = resemblyzer.preprocess_wav(self.target) single_feature["spkemb"] = voice_encoder.embed_utterance(target_wav) @@ -328,26 +340,18 @@ def __getitem__(self, index): frame_pitch_path = self.utt2frame_pitch_path[utt] frame_pitch = np.load(frame_pitch_path) - if self.trans_key: - try: - self.trans_key = int(self.trans_key) - except: - pass - if type(self.trans_key) == int: - frame_pitch = transpose_key(frame_pitch, self.trans_key) - elif self.trans_key: - assert self.target_singer - - frame_pitch = pitch_shift_to_target( - frame_pitch, self.target_pitch_median, self.source_pitch_median - ) - if "target_len" not in single_feature.keys(): single_feature["target_len"] = len(frame_pitch) aligned_frame_pitch = align_length( frame_pitch, single_feature["target_len"] ) single_feature["frame_pitch"] = aligned_frame_pitch + scaler = MinMaxScaler() + scaler.fit(single_feature["frame_pitch"].reshape(-1, 1)) + single_feature["frame_pitch"] = scaler.transform( + single_feature["frame_pitch"].reshape(-1, 1) + ) + single_feature["frame_pitch"] = single_feature["frame_pitch"].reshape(-1) if self.cfg.preprocess.use_uv: frame_uv_path = self.utt2uv_path[utt] @@ -400,6 +404,13 @@ def __getitem__(self, index): ) single_feature["wenet_feat"] = aligned_wenet_feat + if self.cfg.model.condition_encoder.use_hubert: + assert "target_len" in single_feature.keys() + aligned_hubert_feat = self.hubert_aligner.offline_align( + np.load(self.utt2hubert_path[utt]), single_feature["target_len"] + ) + single_feature["hubert_feat"] = aligned_hubert_feat.astype(np.int32) + return single_feature def __len__(self): diff --git a/models/vc/transformer/transformer_trainer.py b/models/vc/transformer/transformer_trainer.py index 24f90572..247db706 100644 --- a/models/vc/transformer/transformer_trainer.py +++ b/models/vc/transformer/transformer_trainer.py @@ -18,8 +18,8 @@ def __init__(self, args, cfg): self.ssim_loss = SSIM() def _build_model(self): - self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min - self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max + # self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min + # self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder) if self.cfg.model.transformer.type == "transformer": self.acoustic_mapper = Transformer(self.cfg.model.transformer) diff --git a/modules/encoder/condition_encoder.py b/modules/encoder/condition_encoder.py index db7d82c6..f274e870 100644 --- a/modules/encoder/condition_encoder.py +++ b/modules/encoder/condition_encoder.py @@ -52,6 +52,8 @@ def __init__(self, cfg): self.input_dim = self.cfg.input_melody_dim self.output_dim = self.cfg.output_melody_dim self.n_bins = self.cfg.n_bins_melody + self.pitch_min = self.cfg.pitch_min + self.pitch_max = self.cfg.pitch_max if self.input_dim != 0: if self.n_bins == 0: @@ -167,6 +169,15 @@ def __init__(self, cfg): 
self.wenet_encoder = ContentEncoder( self.cfg, self.cfg.wenet_dim, self.cfg.content_encoder_dim ) + if cfg.use_hubert: + self.hubert_lookup = nn.Embedding( + num_embeddings=1000, + embedding_dim=self.cfg.content_encoder_dim, + padding_idx=None, + ) + self.hubert_encoder = ContentEncoder( + self.cfg, self.cfg.content_encoder_dim, self.cfg.content_encoder_dim + ) ### Prosody Features ### if cfg.use_f0: @@ -178,7 +189,9 @@ def __init__(self, cfg): if cfg.use_spkid: self.singer_encoder = SingerEncoder(self.cfg) if cfg.use_spkemb: - self.speaker_project = nn.Linear(self.cfg.spkemb_dim, self.cfg.content_encoder_dim) + self.speaker_project = nn.Linear( + self.cfg.spkemb_dim, self.cfg.content_encoder_dim + ) def forward(self, x): outputs = [] @@ -223,6 +236,12 @@ def forward(self, x): outputs.append(wenet_enc_out) seq_len = wenet_enc_out.shape[1] + if self.cfg.use_hubert: + hubert_enc_out = self.hubert_lookup(x["hubert_feat"].squeeze(-1)) + hubert_enc_out = self.hubert_encoder(hubert_enc_out, length=x["target_len"]) + outputs.append(hubert_enc_out) + seq_len = hubert_enc_out.shape[1] + if self.cfg.use_spkid: speaker_enc_out = self.singer_encoder(x["spk_id"]) # [b, 1, 384] assert ( @@ -230,12 +249,15 @@ def forward(self, x): or "contentvec_feat" in x.keys() or "mert_feat" in x.keys() or "wenet_feat" in x.keys() + or "hubert_feat" in x.keys() ) singer_info = speaker_enc_out.expand(-1, seq_len, -1) outputs.append(singer_info) if "spkemb" in x.keys(): - speaker_embedding = self.speaker_project(x["spkemb"].unsqueeze(1)) # [b, 1, 384] + speaker_embedding = self.speaker_project( + x["spkemb"].unsqueeze(1) + ) # [b, 1, 384] speaker_embedding = speaker_embedding.expand(-1, seq_len, -1) outputs.append(speaker_embedding) diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 5e0d7551..be3d4a74 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -120,6 +120,14 @@ def __extract_utt_acoustic_features(dataset_output, cfg, utt): wav = wav_torch.cpu().numpy() # extract features + if cfg.preprocess.extract_speaker: + voice_encoder = resemblyzer.VoiceEncoder("cpu", verbose=False) + speaker_wav = resemblyzer.preprocess_wav(wav_path) + speaker_embedding = voice_encoder.embed_utterance(speaker_wav) + save_feature( + dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding + ) + if cfg.preprocess.extract_duration: durations, phones, start, end = duration.get_duration( utt, wav, cfg.preprocess @@ -263,12 +271,6 @@ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt): wav = wav[start:end].astype(np.float32) wav_torch = torch.from_numpy(wav).to(wav_torch.device) - if cfg.preprocess.extract_speaker: - voice_encoder = resemblyzer.VoiceEncoder(verbose=False) - speaker_wav = resemblyzer.preprocess_wav(wav_path) - speaker_embedding = voice_encoder.embed_utterance(speaker_wav) - save_feature(dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding) - if cfg.preprocess.extract_linear_spec: from utils.mel import extract_linear_features diff --git a/processors/content_extractor.py b/processors/content_extractor.py index 34b54917..6a5b2e36 100644 --- a/processors/content_extractor.py +++ b/processors/content_extractor.py @@ -8,6 +8,7 @@ import numpy as np import yaml import copy +import joblib from tqdm import tqdm from torchaudio.compliance import kaldi from torch.nn.utils.rnn import pad_sequence @@ -68,7 +69,7 @@ def __init__(self, cfg, extractor_type): def init_for_retrans(self): target_hop = self.cfg.preprocess.hop_size - assert 
self.extractor_type in ["whisper", "contentvec", "wenet"] + assert self.extractor_type in ["whisper", "contentvec", "wenet", "hubert"] if self.extractor_type == "whisper": source_hop = ( self.cfg.preprocess.whisper_frameshift @@ -86,6 +87,10 @@ def init_for_retrans(self): * self.cfg.preprocess.wenet_downsample_rate * self.cfg.preprocess.sample_rate ) + elif self.extractor_type == "hubert": + source_hop = ( + self.cfg.preprocess.hubert_frameshift * self.cfg.preprocess.sample_rate + ) source_hop = int(source_hop) factor = np.gcd(source_hop, target_hop) source_hop //= factor @@ -230,6 +235,8 @@ def get_valid_features(self, utt, content_feature): ) # 40ms elif self.extractor_type == "mert": frameshift = self.cfg.preprocess.mert_frameshift + elif self.extractor_type == "hubert": + frameshift = self.cfg.preprocess.hubert_frameshift else: raise NotImplementedError @@ -495,6 +502,53 @@ def extract_content_features(self, wavs): return mert_features +class HubertExtractor(BaseExtractor): + def __init__(self, cfg): + super(HubertExtractor, self).__init__(cfg) + self.extractor_type = "hubert" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def load_model(self): + # load whisper checkpoint + print("Loading Hubert Model...") + (model, _, _) = checkpoint_utils.load_model_ensemble_and_task( + [self.cfg.preprocess.hubert_model_path] + ) + model = model[0] + if torch.cuda.is_available(): + print("Using GPU...\n") + model = model.cuda() + else: + print("Using CPU...\n") + + self.model = model.eval() + self.km_model = joblib.load(self.cfg.preprocess.hubert_km_path) + self.C_np = self.km_model.cluster_centers_.transpose() + self.Cnorm_np = (self.C_np**2).sum(0, keepdims=True) + self.C = torch.from_numpy(self.C_np) + self.Cnorm = torch.from_numpy(self.Cnorm_np) + + def extract_content_features(self, wavs): + hubert_features = [] + for wav in wavs: + feat, _ = self.model.extract_features( + source=wav.view(1, -1).to(self.device), + padding_mask=None, + mask=False, + output_layer=11, + ) + feat = feat.squeeze(0).cpu().detach() + dist = ( + feat.pow(2).sum(1, keepdim=True) + - 2 * torch.matmul(feat, self.C) + + self.Cnorm + ) + feat = dist.argmin(dim=1).unsqueeze(-1) + hubert_features.append(feat) + + return hubert_features + + def extract_utt_content_features_dataloader(cfg, metadata, num_workers): dataset_name = metadata[0]["Dataset"] with torch.no_grad(): @@ -624,3 +678,36 @@ def extract_utt_content_features_dataloader(cfg, metadata, num_workers): batch_content_features = extractor.extract_content_features(wavs) for index, utt in enumerate(_metadata): extractor.save_feature(utt, batch_content_features[index]) + + if cfg.preprocess.extract_hubert_feature: + feat_dir = os.path.join( + cfg.preprocess.processed_dir, dataset_name, "hubert" + ) + os.makedirs(feat_dir, exist_ok=True) + feat_files_num = len(os.listdir(feat_dir)) + if feat_files_num != len(metadata): + hubert_waveforms = LibrosaDataset( + cfg, + dataset_name, + cfg.preprocess.hubert_sample_rate, + metadata=metadata, + ) + data_loader = DataLoader( + hubert_waveforms, + num_workers=num_workers, + shuffle=False, + pin_memory=cfg.preprocess.pin_memory, + batch_size=cfg.preprocess.content_feature_batch_size, + collate_fn=collate_batch, + drop_last=False, + ) + extractor = HubertExtractor(cfg) + extractor.load_model() + for batch_idx, items in enumerate(tqdm(data_loader)): + _metadata, wavs, lens = items + + batch_content_features = extractor.extract_content_features( + wavs, + ) + for index, utt in enumerate(_metadata): + 
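+                    # Each utterance is stored as a [T, 1] array of discrete Hubert
+                    # k-means token ids produced by HubertExtractor.extract_content_features.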
extractor.save_feature(utt, batch_content_features[index]) From dbfd101a569100b64e35f362c94f44686674a61b Mon Sep 17 00:00:00 2001 From: mingyang Date: Mon, 15 Jan 2024 10:03:37 +0800 Subject: [PATCH 05/12] Change input to Hubert token and normalized pitch --- bins/vc/inference.py | 2 +- models/base/new_inference.py | 7 ++-- models/vc/base/vc_inference.py | 59 ++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/bins/vc/inference.py b/bins/vc/inference.py index f5bf40ee..05663540 100644 --- a/bins/vc/inference.py +++ b/bins/vc/inference.py @@ -62,7 +62,7 @@ def prepare_for_audio_file(args, cfg, num_workers=1): def merge_for_audio_segments(audio_files, args, cfg): audio_name = cfg.inference.source_audio_name - target_singer_name = os.path.basename(args.target).split('.')[0] + target_singer_name = os.path.basename(args.target).split(".")[0] merge_segments_encodec( wav_files=audio_files, diff --git a/models/base/new_inference.py b/models/base/new_inference.py index a2ffec20..01dce86d 100644 --- a/models/base/new_inference.py +++ b/models/base/new_inference.py @@ -146,14 +146,11 @@ def inference(self): torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt")) j += 1 - # vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir) - vocoder_cfg = load_config( - os.path.join(self.args.vocoder_dir, "args.json"), lowercase=True - ) + vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir) res = synthesis( cfg=vocoder_cfg, - vocoder_weight_file=self.args.vocoder_dir, + vocoder_weight_file=vocoder_ckpt, n_samples=None, pred=[ torch.load( diff --git a/models/vc/base/vc_inference.py b/models/vc/base/vc_inference.py index a97b36ed..c7eee255 100644 --- a/models/vc/base/vc_inference.py +++ b/models/vc/base/vc_inference.py @@ -2,10 +2,19 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
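+#
+# The overridden inference() below denormalizes the predicted mels with the target
+# dataset's mel extrema, loads the vocoder config from args.json in the vocoder
+# directory (args.vocoder_dir), and writes one synthesized waveform per test utterance.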
+import os +import torch from models.base.new_inference import BaseInference from models.vc.base.vc_dataset import VCTestCollator, VCTestDataset +from utils.io import save_audio +from utils.util import load_config +from utils.audio_slicer import is_silence +from models.vocoders.vocoder_inference import synthesis + +EPS = 1.0e-12 + class VCInference(BaseInference): def __init__(self, args=None, cfg=None, infer_type="from_dataset"): @@ -13,3 +22,53 @@ def __init__(self, args=None, cfg=None, infer_type="from_dataset"): def _build_test_dataset(self): return VCTestDataset, VCTestCollator + + @torch.inference_mode() + def inference(self): + for i, batch in enumerate(self.test_dataloader): + y_pred = self._inference_each_batch(batch).cpu() + mel_min, mel_max = self.test_dataset.target_mel_extrema + y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min + y_ls = y_pred.chunk(self.test_batch_size) + tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size) + j = 0 + for it, l in zip(y_ls, tgt_ls): + l = l.item() + it = it.squeeze(0)[:l] + uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"] + torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt")) + j += 1 + + vocoder_cfg = load_config( + os.path.join(self.args.vocoder_dir, "args.json"), lowercase=True + ) + + res = synthesis( + cfg=vocoder_cfg, + vocoder_weight_file=self.args.vocoder_dir, + n_samples=None, + pred=[ + torch.load( + os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"])) + ).numpy(force=True) + for i in self.test_dataset.metadata + ], + ) + + output_audio_files = [] + for it, wav in zip(self.test_dataset.metadata, res): + uid = it["Uid"] + file = os.path.join(self.args.output_dir, f"{uid}.wav") + output_audio_files.append(file) + + wav = wav.numpy(force=True) + save_audio( + file, + wav, + self.cfg.preprocess.sample_rate, + add_silence=False, + turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate), + ) + os.remove(os.path.join(self.args.output_dir, f"{uid}.pt")) + + return sorted(output_audio_files) From 3574f870d1d3b809653b94b79dc8bc05baaa2261 Mon Sep 17 00:00:00 2001 From: mingyang Date: Fri, 26 Jan 2024 12:35:38 +0800 Subject: [PATCH 06/12] Add implement of VitsVC --- bins/vc/inference.py | 2 + bins/vc/train.py | 5 +- config/vitsvc.json | 193 +++++++++++ egs/vc/TransformerVC/README.md | 52 ++- egs/vc/VitsVC/README.md | 115 +++++++ egs/vc/VitsVC/exp_config.json | 167 +++++++++ egs/vc/VitsVC/run.sh | 1 + models/vc/base/vc_dataset.py | 9 + models/vc/vits/__init__.py | 0 models/vc/vits/vits.py | 294 ++++++++++++++++ models/vc/vits/vits_inference.py | 84 +++++ models/vc/vits/vits_trainer.py | 564 +++++++++++++++++++++++++++++++ 12 files changed, 1452 insertions(+), 34 deletions(-) create mode 100644 config/vitsvc.json create mode 100644 egs/vc/VitsVC/README.md create mode 100644 egs/vc/VitsVC/exp_config.json create mode 120000 egs/vc/VitsVC/run.sh create mode 100644 models/vc/vits/__init__.py create mode 100644 models/vc/vits/vits.py create mode 100644 models/vc/vits/vits_inference.py create mode 100644 models/vc/vits/vits_trainer.py diff --git a/bins/vc/inference.py b/bins/vc/inference.py index 05663540..f5977902 100644 --- a/bins/vc/inference.py +++ b/bins/vc/inference.py @@ -13,6 +13,7 @@ from models.vc.transformer.transformer_inference import TransformerInference +from models.vc.vits.vits_inference import VitsInference from utils.util import load_config from utils.audio_slicer import split_audio, merge_segments_encodec from processors import acoustic_extractor, 
content_extractor @@ -21,6 +22,7 @@ def build_inference(args, cfg, infer_type="from_dataset"): supported_inference = { "TransformerVC": TransformerInference, + "VitsVC": VitsInference, } inference_class = supported_inference[cfg.model_type] diff --git a/bins/vc/train.py b/bins/vc/train.py index ac0be689..e0ddeaee 100644 --- a/bins/vc/train.py +++ b/bins/vc/train.py @@ -9,13 +9,12 @@ from models.vc.transformer.transformer_trainer import TransformerTrainer +from models.vc.vits.vits_trainer import VitsVCTrainer from utils.util import load_config def build_trainer(args, cfg): - supported_trainer = { - "TransformerVC": TransformerTrainer, - } + supported_trainer = {"TransformerVC": TransformerTrainer, "VitsVC": VitsVCTrainer} trainer_class = supported_trainer[cfg.model_type] trainer = trainer_class(args, cfg) diff --git a/config/vitsvc.json b/config/vitsvc.json new file mode 100644 index 00000000..c56de793 --- /dev/null +++ b/config/vitsvc.json @@ -0,0 +1,193 @@ +{ + "base_config": "config/base.json", + "model_type": "VITS", + "task_type": "svc", + "preprocess": { + "extract_phone": false, + "extract_mel": true, + "extract_linear_spec": true, + "extract_audio": true, + "use_linear": true, + "use_mel": true, + "use_audio": true, + "use_text": false, + "use_phone": true, + + "fmin": 0, + "fmax": null, + "f0_min": 50, + "f0_max": 1100, + // f0_bin in sovits + "pitch_bin": 256, + // filter_length in sovits + "n_fft": 2048, + // hop_length in sovits + "hop_size": 512, + // win_length in sovits + "win_size": 2048, + "segment_size": 8192, + "n_mel": 100, + "sample_rate": 44100, + + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + "hubert_dir": "hubert", + }, + "model": { + "condition_encoder": { + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + //# Quantization (0 for not quantization) + "output_melody_dim": 196, + "input_loudness_dim": 1, + "use_log_loudness": false, + "n_bins_loudness": 256, + "output_loudness_dim": 196, + "use_whisper": false, + "use_contentvec": false, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 196, + "output_singer_dim": 196, + "singer_table_size": 512, + "output_content_dim": 196, + "use_spkid": true + }, + "vits": { + "filter_channels": 256, + "gin_channels": 256, + "hidden_channels": 192, + "inter_channels": 192, + "kernel_size": 3, + "n_flow_layer": 4, + "n_heads": 2, + "n_layers": 6, + "n_layers_q": 3, + "n_speakers": 512, + "p_dropout": 0.1, + "ssl_dim": 256, + "use_spectral_norm": false, + }, + "generator": "hifigan", + "generator_config": { + "hifigan": { + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "upsample_rates": [ + 8,4,2,2,2 + ], + "upsample_kernel_sizes": [ + 16,8,4,4,4 + ], + "upsample_initial_channel": 512, + "resblock_dilation_sizes": [ + [1,3,5], + [1,3,5], + [1,3,5] + ] + }, + "melgan": { + "ratios": [8, 8, 2, 2, 2], + "ngf": 32, + "n_residual_layers": 3, + "num_D": 3, + "ndf": 16, + "n_layers": 4, + "downsampling_factor": 4 + }, + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 8,8,2,2,2, + ], + "upsample_kernel_sizes": [ + 16,16,4,4,4, + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [1,3,5], + [1,3,5], + [1,3,5] + ] + }, + "nsfhifigan": { + 
"resblock": "1", + "harmonic_num": 8, + "upsample_rates": [ + 8,4,2,2,2, + ], + "upsample_kernel_sizes": [ + 16,8,4,4,4, + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "resblock_dilation_sizes": [ + [1,3,5], + [1,3,5], + [1,3,5] + ] + }, + "apnet": { + "ASP_channel": 512, + "ASP_resblock_kernel_sizes": [3,7,11], + "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "ASP_input_conv_kernel_size": 7, + "ASP_output_conv_kernel_size": 7, + + "PSP_channel": 512, + "PSP_resblock_kernel_sizes": [3,7,11], + "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "PSP_input_conv_kernel_size": 7, + "PSP_output_R_conv_kernel_size": 7, + "PSP_output_I_conv_kernel_size": 7, + } + }, + }, + "train": { + "fp16_run": true, + "learning_rate": 2e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "batch_size": 16, + "lr_decay": 0.999875, + // "segment_size": 8192, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "AdamW": { + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + } + } +} \ No newline at end of file diff --git a/egs/vc/TransformerVC/README.md b/egs/vc/TransformerVC/README.md index 1797e32f..0df6692d 100644 --- a/egs/vc/TransformerVC/README.md +++ b/egs/vc/TransformerVC/README.md @@ -1,6 +1,6 @@ -# Transformer for Singing Voice Conversion +# Transformer for Voice Conversion -This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion. +This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for voice conversion. There are four stages in total: @@ -18,7 +18,7 @@ There are four stages in total: ### Dataset Download -By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). +By default, we utilize the LibriTTS datasets for training. How to download them is detailed [here](../../datasets/README.md). ### Configuration @@ -26,19 +26,11 @@ Specify the dataset paths in `exp_config.json`. Note that you can change the `d ```json "dataset": [ - "m4singer", - "opencpop", - "opensinger", - "svcc", - "vctk" + "libritts" ], "dataset_path": { // TODO: Fill in your dataset path - "m4singer": "[M4Singer dataset path]", - "opencpop": "[Opencpop dataset path]", - "opensinger": "[OpenSinger dataset path]", - "svcc": "[SVCC dataset path]", - "vctk": "[VCTK dataset path]" + "libritts": "[LibriTTS dataset path]" }, ``` @@ -46,7 +38,7 @@ Specify the dataset paths in `exp_config.json`. Note that you can change the `d ### Content-based Pretrained Models Download -By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). +By default, we utilize the Hubert to extract content features. How to download them is detailed [here](../../../pretrained/README.md). ### Configuration @@ -54,7 +46,7 @@ Specify the dataset path and the output path for saving the processed data and t ```json // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" - "log_dir": "ckpts/svc", + "log_dir": "ckpts/vc", "preprocess": { // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", @@ -67,7 +59,7 @@ Specify the dataset path and the output path for saving the processed data and t Run the `run.sh` as the preproces stage (set `--stage 1`). 
```bash -sh egs/svc/TransformerSVC/run.sh --stage 1 +sh egs/vc/TransformerVC/run.sh --stage 1 ``` > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. @@ -106,10 +98,10 @@ We provide the default hyparameters in the `exp_config.json`. They can work on s ### Run -Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. +Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vc/[YourExptName]`. ```bash -sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName] +sh egs/vc/TransformerVC/run.sh --stage 2 --name [YourExptName] ``` > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. @@ -124,24 +116,22 @@ We fine-tune the official BigVGAN pretrained model with over 120 hours singing v For inference/conversion, you need to specify the following configurations when running `run.sh`: -| Parameters | Description | Example | -| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` | -| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` | -| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | -| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | -| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | +| Parameters | Description | Example | +| --------------------------------------------------- |---------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The audio file of the target speaker you want to convert into. | `[Your path to the target audio file]` | -For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: +For example, if you want to make the speaker in `reference.wav` speak the utterances in the `[Your Audios Folder]`, just run: ```bash cd Amphion -sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \ - --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ - --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ +sh egs/vc/TransformerVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/vc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vc/[YourExptName]/result \ --infer_source_audio_dir [Your Audios Folder] \ - --infer_target_speaker "opencpop_female1" \ - --infer_key_shift "autoshift" + --infer_target_speaker "reference.wav" ``` ## Citations diff --git a/egs/vc/VitsVC/README.md b/egs/vc/VitsVC/README.md new file mode 100644 index 00000000..c3990cb3 --- /dev/null +++ b/egs/vc/VitsVC/README.md @@ -0,0 +1,115 @@ +# VITS for Voice Conversion + +This is an implementation of VITS as the acoustic model for end-to-end voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the LibriTTS dataset for training. How to download it is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "libritts" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "libritts": "[LibriTTS dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize HuBERT to extract content features. How to download it is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/vc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preprocess stage (set `--stage 1`). + +```bash +sh egs/vc/VitsVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vc/[YourExptName]`. + +```bash +sh egs/vc/VitsVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. Inference/Conversion + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- |---------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The audio file of the target speaker you want to convert into. | `[Your path to the target audio file]` | + +For example, if you want to make the speaker in `reference.wav` speak the utterances in the `[Your Audios Folder]`, just run: + +```bash +sh egs/vc/VitsVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/vc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "reference.wav" +``` \ No newline at end of file diff --git a/egs/vc/VitsVC/exp_config.json b/egs/vc/VitsVC/exp_config.json new file mode 100644 index 00000000..a6497a5c --- /dev/null +++ b/egs/vc/VitsVC/exp_config.json @@ -0,0 +1,167 @@ +{ + "base_config": "config/vitsvc.json", + "model_type": "VitsVC", + "dataset": [ + "libritts" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "libritts": "/home/mingyang/LibriTTS/LibriTTS" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/vc", + "preprocess": { + // TODO: Fill in the output data path.
The default value is "Amphion/data" + "processed_dir": "data", + "speaker_dir": "speaker", + "f0_min": 50, + "f0_max": 1100, + // f0_bin in sovits + "pitch_bin": 256, + // filter_length in sovits + "n_fft": 1024, + // hop_length in sovits + "hop_size": 256, + // win_length in sovits + "win_size": 1024, + "segment_size": 8192, + "n_mel": 100, + "sample_rate": 24000, + + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_energy": false, + "extract_speaker": true, + "extract_uv": false, + "extract_linear_spec": true, + "extract_audio": true, + // contentvec + "extract_contentvec_feature": false, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": false, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // hubert + "extract_hubert_feature": true, + "hubert_sample_rate": 16000, + "hubert_frameshift": 0.02, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + "hubert_model_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3.pt", + "hubert_km_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin", + // Config for features usage + "use_mel": true, + "use_frame_pitch": true, + "use_uv": false, + "use_spkid": false, + "use_spkemb": true, + "use_contentvec": false, + "use_whisper": false, + "use_hubert": true, + "use_text": false, + "use_phone": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + //# Quantization (0 for not quantization) + "output_melody_dim": 192, + + "use_contentvec": false, + "use_whisper": false, + "use_mert": false, + "use_wenet": false, + "use_hubert": true, + "whisper_dim": 1024, + "contentvec_dim": 256, + "content_encoder_dim": 192, + "output_singer_dim": 192, + "singer_table_size": 512, + "output_content_dim": 192, + "use_spkid": false, + "use_spkemb": true, + "spkemb_dim": 256, + "f0_min": 0, + "f0_max": 1, + "pitch_max": 1100.0, + "pitch_min": 50.0 + }, + "vits": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "ssl_dim": 256, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false + }, + "generator": "nsfhifigan" + }, + "train": { + "batch_size": 32, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + 
} + }, + "inference": { + "batch_size": 1 + } +} \ No newline at end of file diff --git a/egs/vc/VitsVC/run.sh b/egs/vc/VitsVC/run.sh new file mode 120000 index 00000000..f8daac3d --- /dev/null +++ b/egs/vc/VitsVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/models/vc/base/vc_dataset.py b/models/vc/base/vc_dataset.py index c2ab1972..65bb0703 100644 --- a/models/vc/base/vc_dataset.py +++ b/models/vc/base/vc_dataset.py @@ -33,6 +33,15 @@ def __init__(self, cfg, dataset, is_valid=False): BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) cfg = self.cfg + if cfg.preprocess.segment_size is not None: + metadata_new = [] + for item in self.metadata: + if ( + item["Duration"] * cfg.preprocess.sample_rate + > cfg.preprocess.segment_size + ): + metadata_new.append(item) + self.metadata = metadata_new if cfg.model.condition_encoder.use_whisper: self.whisper_aligner = WhisperExtractor(self.cfg) diff --git a/models/vc/vits/__init__.py b/models/vc/vits/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/vc/vits/vits.py b/models/vc/vits/vits.py new file mode 100644 index 00000000..29088b42 --- /dev/null +++ b/models/vc/vits/vits.py @@ -0,0 +1,294 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.1-Stable/models.py +import copy +import torch +from torch import nn +from torch.nn import functional as F + +from utils.util import * +from utils.f0 import f0_to_coarse + +from modules.transformer.attentions import Encoder +from models.tts.vits.vits import ResidualCouplingBlock, PosteriorEncoder +from models.vocoders.gan.generator.bigvgan import BigVGAN +from models.vocoders.gan.generator.hifigan import HiFiGAN +from models.vocoders.gan.generator.nsfhifigan import NSFHiFiGAN +from models.vocoders.gan.generator.melgan import MelGAN +from models.vocoders.gan.generator.apnet import APNet +from modules.encoder.condition_encoder import ConditionEncoder + + +def slice_pitch_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size) + return ret, ret_pitch, ids_str + + +class ContentEncoder(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + kernel_size, + n_layers, + gin_channels=0, + filter_channels=None, + n_heads=None, + p_dropout=None, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.f0_emb = nn.Embedding(256, hidden_channels) + + self.enc_ = Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + # condition_encoder ver. 
+ def forward(self, x, x_mask, noice_scale=1): + x = self.enc_(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask + + return z, m, logs, x_mask + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, spec_channels, segment_size, cfg): + super().__init__() + self.spec_channels = spec_channels + self.segment_size = segment_size + self.cfg = cfg + self.inter_channels = cfg.model.vits.inter_channels + self.hidden_channels = cfg.model.vits.hidden_channels + self.filter_channels = cfg.model.vits.filter_channels + self.n_heads = cfg.model.vits.n_heads + self.n_layers = cfg.model.vits.n_layers + self.kernel_size = cfg.model.vits.kernel_size + self.p_dropout = cfg.model.vits.p_dropout + self.ssl_dim = cfg.model.vits.ssl_dim + self.n_flow_layer = cfg.model.vits.n_flow_layer + self.gin_channels = cfg.model.vits.gin_channels + self.n_speakers = cfg.model.vits.n_speakers + + # f0 + self.n_bins = cfg.preprocess.pitch_bin + self.f0_min = cfg.preprocess.f0_min + self.f0_max = cfg.preprocess.f0_max + + # TODO: sort out the config + self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder) + + self.emb_g = nn.Embedding(self.n_speakers, self.gin_channels) + + self.enc_p = ContentEncoder( + self.inter_channels, + self.hidden_channels, + filter_channels=self.filter_channels, + n_heads=self.n_heads, + n_layers=self.n_layers, + kernel_size=self.kernel_size, + p_dropout=self.p_dropout, + ) + + assert cfg.model.generator in [ + "bigvgan", + "hifigan", + "melgan", + "nsfhifigan", + "apnet", + ] + self.dec_name = cfg.model.generator + temp_cfg = copy.deepcopy(cfg) + temp_cfg.preprocess.n_mel = self.inter_channels + if cfg.model.generator == "bigvgan": + temp_cfg.model.bigvgan = cfg.model.generator_config.bigvgan + self.dec = BigVGAN(temp_cfg) + elif cfg.model.generator == "hifigan": + temp_cfg.model.hifigan = cfg.model.generator_config.hifigan + self.dec = HiFiGAN(temp_cfg) + elif cfg.model.generator == "melgan": + temp_cfg.model.melgan = cfg.model.generator_config.melgan + self.dec = MelGAN(temp_cfg) + elif cfg.model.generator == "nsfhifigan": + temp_cfg.model.nsfhifigan = cfg.model.generator_config.nsfhifigan + self.dec = NSFHiFiGAN(temp_cfg) # TODO: nsf need f0 + elif cfg.model.generator == "apnet": + temp_cfg.model.apnet = cfg.model.generator_config.apnet + self.dec = APNet(temp_cfg) + + self.enc_q = PosteriorEncoder( + self.spec_channels, + self.inter_channels, + self.hidden_channels, + 5, + 1, + 16, + gin_channels=self.gin_channels, + ) + + self.flow = ResidualCouplingBlock( + self.inter_channels, + self.hidden_channels, + 5, + 1, + self.n_flow_layer, + gin_channels=self.gin_channels, + ) + + def forward(self, data): + """VitsSVC forward function. + + Args: + data (dict): condition data & audio data, including: + B: batch size, T: target length + { + "spk_id": [B, singer_table_size] + "target_len": [B] + "mask": [B, T, 1] + "mel": [B, T, n_mel] + "linear": [B, T, n_fft // 2 + 1] + "frame_pitch": [B, T] + "frame_uv": [B, T] + "audio": [B, audio_len] + "audio_len": [B] + "contentvec_feat": [B, T, contentvec_dim] + "whisper_feat": [B, T, whisper_dim] + ... 
+ } + """ + + # TODO: elegantly handle the dimensions + if "contentvec_feat" in data.keys(): + c = data["contentvec_feat"].transpose(1, 2) + elif "whisper_feat" in data.keys(): + c = data["whisper_feat"].transpose(1, 2) + elif "mert_feat" in data.keys(): + c = data["mert_feat"].transpose(1, 2) + elif "wenet_feat" in data.keys(): + c = data["wenet_feat"].transpose(1, 2) + elif "hubert_feat" in data.keys(): + c = data["hubert_feat"].transpose(1, 2) + + spec = data["linear"].transpose(1, 2) + + if self.cfg.model.condition_encoder.use_spkid: + g = data["spk_id"] + g = self.emb_g(g).transpose(1, 2) + elif self.cfg.model.condition_encoder.use_spkemb: + g = data["spkemb"].unsqueeze(-1) + + c_lengths = data["target_len"] + spec_lengths = data["target_len"] + f0 = data["frame_pitch"] + + x_mask = torch.unsqueeze(sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype) + # condition_encoder ver. + x = self.condition_encoder(data).transpose(1, 2) + + # prior encoder + z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask) + # posterior encoder + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + + # flow + z_p = self.flow(z, spec_mask, g=g) + z_slice, pitch_slice, ids_slice = rand_slice_segments_with_pitch( + z, f0, spec_lengths, self.segment_size + ) + + if self.dec_name == "nsfhifigan": + o = self.dec(z_slice, f0=f0.float()) + elif self.dec_name == "apnet": + _, _, _, _, o = self.dec(z_slice) + else: + o = self.dec(z_slice) + + outputs = { + "y_hat": o, + "ids_slice": ids_slice, + "x_mask": x_mask, + "z_mask": data["mask"].transpose(1, 2), + "z": z, + "z_p": z_p, + "m_p": m_p, + "logs_p": logs_p, + "m_q": m_q, + "logs_q": logs_q, + } + return outputs + + @torch.no_grad() + def infer(self, data, noise_scale=0.35, seed=52468): + # c, f0, uv, g + if "contentvec_feat" in data.keys(): + c = data["contentvec_feat"].transpose(1, 2) + elif "whisper_feat" in data.keys(): + c = data["whisper_feat"].transpose(1, 2) + elif "mert_feat" in data.keys(): + c = data["mert_feat"].transpose(1, 2) + elif "wenet_feat" in data.keys(): + c = data["wenet_feat"].transpose(1, 2) + elif "hubert_feat" in data.keys(): + c = data["hubert_feat"].transpose(1, 2) + + f0 = data["frame_pitch"] + if self.cfg.model.condition_encoder.use_spkid: + g = data["spk_id"] + if g.dim() == 1: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + elif self.cfg.model.condition_encoder.use_spkemb: + g = data["spkemb"].unsqueeze(-1) + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + + x_mask = torch.unsqueeze(sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype) + # condition_encoder ver. + x = self.condition_encoder(data).transpose(1, 2) + + z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, noice_scale=noise_scale) + z = self.flow(z_p, c_mask, g=g, reverse=True) + + if self.dec_name == "nsfhifigan": + o = self.dec(z * c_mask, f0=f0.float()) + elif self.dec_name == "apnet": + _, _, _, _, o = self.dec(z * c_mask) + else: + o = self.dec(z * c_mask) + return o, f0 diff --git a/models/vc/vits/vits_inference.py b/models/vc/vits/vits_inference.py new file mode 100644 index 00000000..7c2c0253 --- /dev/null +++ b/models/vc/vits/vits_inference.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import json +import os +import time +import numpy as np +from tqdm import tqdm +import torch + +from models.vc.base import VCInference +from models.vc.vits.vits import SynthesizerTrn + +from models.vc.base.vc_dataset import VCTestDataset, VCTestCollator +from utils.io import save_audio +from utils.audio_slicer import is_silence + + +class VitsInference(VCInference): + def __init__(self, args=None, cfg=None, infer_type="from_dataset"): + VCInference.__init__(self, args, cfg) + + def _build_model(self): + net_g = SynthesizerTrn( + self.cfg.preprocess.n_fft // 2 + 1, + self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size, + self.cfg, + ) + self.model = net_g + return net_g + + def build_save_dir(self, dataset, speaker): + save_dir = os.path.join( + self.args.output_dir, + "vc_am_step-{}_{}".format(self.am_restore_step, self.args.mode), + ) + if dataset is not None: + save_dir = os.path.join(save_dir, "data_{}".format(dataset)) + if speaker != -1: + save_dir = os.path.join( + save_dir, + "spk_{}".format(speaker), + ) + os.makedirs(save_dir, exist_ok=True) + print("Saving to ", save_dir) + return save_dir + + @torch.inference_mode() + def inference(self): + res = [] + for i, batch in enumerate(self.test_dataloader): + pred_audio_list = self._inference_each_batch(batch) + for it, wav in zip(self.test_dataset.metadata, pred_audio_list): + uid = it["Uid"] + file = os.path.join(self.args.output_dir, f"{uid}.wav") + + wav = wav.numpy(force=True) + save_audio( + file, + wav, + self.cfg.preprocess.sample_rate, + add_silence=False, + turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate), + ) + res.append(file) + return res + + def _inference_each_batch(self, batch_data, noise_scale=0.667): + device = self.accelerator.device + pred_res = [] + self.model.eval() + with torch.no_grad(): + # Put the data to device + # device = self.accelerator.device + for k, v in batch_data.items(): + batch_data[k] = v.to(device) + + audios, f0 = self.model.infer(batch_data, noise_scale=noise_scale) + + pred_res.extend(audios) + + return pred_res diff --git a/models/vc/vits/vits_trainer.py b/models/vc/vits/vits_trainer.py new file mode 100644 index 00000000..132cd189 --- /dev/null +++ b/models/vc/vits/vits_trainer.py @@ -0,0 +1,564 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from torch.optim.lr_scheduler import ExponentialLR +from tqdm import tqdm +from pathlib import Path + +import accelerate + +# from models.svc.base import SVCTrainer +from models.vc.base.vc_dataset import VCCollator, VCDataset +from models.vc.vits.vits import * +from models.tts.base import TTSTrainer + +from utils.mel import mel_spectrogram_torch +import json + +from models.vocoders.gan.discriminator.mpd import ( + MultiPeriodDiscriminator_vits as MultiPeriodDiscriminator, +) + + +class VitsVCTrainer(TTSTrainer): + def __init__(self, args, cfg): + self.args = args + self.cfg = cfg + self._init_accelerator() + + TTSTrainer.__init__(self, args, cfg) + + def _build_model(self): + net_g = SynthesizerTrn( + self.cfg.preprocess.n_fft // 2 + 1, + self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size, + # directly use cfg + self.cfg, + ) + net_d = MultiPeriodDiscriminator(self.cfg.model.vits.use_spectral_norm) + model = {"generator": net_g, "discriminator": net_d} + + return model + + def _build_dataset(self): + return VCDataset, VCCollator + + def _build_optimizer(self): + optimizer_g = torch.optim.AdamW( + self.model["generator"].parameters(), + self.cfg.train.learning_rate, + betas=self.cfg.train.AdamW.betas, + eps=self.cfg.train.AdamW.eps, + ) + optimizer_d = torch.optim.AdamW( + self.model["discriminator"].parameters(), + self.cfg.train.learning_rate, + betas=self.cfg.train.AdamW.betas, + eps=self.cfg.train.AdamW.eps, + ) + optimizer = {"optimizer_g": optimizer_g, "optimizer_d": optimizer_d} + + return optimizer + + def _build_scheduler(self): + scheduler_g = ExponentialLR( + self.optimizer["optimizer_g"], + gamma=self.cfg.train.lr_decay, + last_epoch=self.epoch - 1, + ) + scheduler_d = ExponentialLR( + self.optimizer["optimizer_d"], + gamma=self.cfg.train.lr_decay, + last_epoch=self.epoch - 1, + ) + + scheduler = {"scheduler_g": scheduler_g, "scheduler_d": scheduler_d} + return scheduler + + def _build_criterion(self): + class GeneratorLoss(nn.Module): + def __init__(self, cfg): + super(GeneratorLoss, self).__init__() + self.cfg = cfg + self.l1_loss = nn.L1Loss() + + def generator_loss(self, disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + def feature_loss(self, fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + def kl_loss(self, z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l + + def forward( + self, + outputs_g, + outputs_d, + y_mel, + y_hat_mel, + ): + loss_g = {} + + # mel loss + loss_mel = self.l1_loss(y_mel, y_hat_mel) * self.cfg.train.c_mel + loss_g["loss_mel"] = loss_mel + + # kl loss + loss_kl = ( + self.kl_loss( + outputs_g["z_p"], + outputs_g["logs_q"], + outputs_g["m_p"], + outputs_g["logs_p"], + outputs_g["z_mask"], + ) + * self.cfg.train.c_kl + ) + loss_g["loss_kl"] = loss_kl + + # feature loss + loss_fm = self.feature_loss(outputs_d["fmap_rs"], outputs_d["fmap_gs"]) + loss_g["loss_fm"] = loss_fm + + # gan loss + loss_gen, losses_gen = 
self.generator_loss(outputs_d["y_d_hat_g"]) + loss_g["loss_gen"] = loss_gen + loss_g["loss_gen_all"] = loss_mel + loss_kl + loss_fm + loss_gen + + return loss_g + + class DiscriminatorLoss(nn.Module): + def __init__(self, cfg): + super(DiscriminatorLoss, self).__init__() + self.cfg = cfg + self.l1Loss = torch.nn.L1Loss(reduction="mean") + + def __call__(self, disc_real_outputs, disc_generated_outputs): + loss_d = {} + + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + loss_d["loss_disc_all"] = loss + + return loss_d + + criterion = { + "generator": GeneratorLoss(self.cfg), + "discriminator": DiscriminatorLoss(self.cfg), + } + return criterion + + def _check_resume(self): + if self.args.resume: + if self.args.resume_from_ckpt_path == "": + ## Automatically resume according to the current exprimental name + self.logger.info( + "Automatically resuming from latest checkpoint in {}...".format( + self.checkpoint_dir + ) + ) + start = time.monotonic_ns() + ckpt_path = self.__load_model( + checkpoint_dir=self.checkpoint_dir, + resume_type=self.args.resume_type, + ) + end = time.monotonic_ns() + self.logger.info( + f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms" + ) + self.checkpoints_path = json.load( + open(os.path.join(ckpt_path, "ckpts.json"), "r") + ) + else: + ## Resume from the given checkpoint path + if not os.path.exists(self.args.resume_from_ckpt_path): + raise ValueError( + "[Error] The resumed checkpoint path {} don't exist.".format( + self.args.resume_from_ckpt_path + ) + ) + self.logger.info( + "Resuming from {}...".format(self.args.resume_from_ckpt_path) + ) + start = time.monotonic_ns() + ckpt_path = self.__load_model( + checkpoint_path=self.args.resume_from_ckpt_path, + resume_type=self.args.resume_type, + ) + end = time.monotonic_ns() + self.logger.info( + f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms" + ) + + def __load_model( + self, + checkpoint_dir: str = None, + checkpoint_path: str = None, + resume_type: str = "", + ): + r"""Load model from checkpoint. If checkpoint_path is None, it will + load the latest checkpoint in checkpoint_dir. If checkpoint_path is not + None, it will load the checkpoint specified by checkpoint_path. **Only use this + method after** ``accelerator.prepare()``. + """ + if checkpoint_path is None: + ls = [str(i) for i in Path(checkpoint_dir).glob("*")] + ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True) + checkpoint_path = ls[0] + self.logger.info("Resume from {}...".format(checkpoint_path)) + + if resume_type in ["resume", ""]: + # Load all the things, including model weights, optimizer, scheduler, and random states. 
+ self.accelerator.load_state(input_dir=checkpoint_path) + + # set epoch and step + self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1 + self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1 + + elif resume_type == "finetune": + # Load only the model weights + accelerate.load_checkpoint_and_dispatch( + self.accelerator.unwrap_model(self.model), + os.path.join(checkpoint_path, "pytorch_model.bin"), + ) + self.logger.info("Load model weights for finetune...") + + else: + raise ValueError("Resume_type must be `resume` or `finetune`.") + + return checkpoint_path + + # Keep legacy unchanged + def write_summary( + self, + losses, + stats, + images={}, + audios={}, + audio_sampling_rate=24000, + tag="train", + ): + for key, value in losses.items(): + self.sw.add_scalar(tag + "/" + key, value, self.step) + self.sw.add_scalar( + "learning_rate", + self.optimizer["optimizer_g"].param_groups[0]["lr"], + self.step, + ) + + if len(images) != 0: + for key, value in images.items(): + self.sw.add_image(key, value, self.global_step, batchformats="HWC") + if len(audios) != 0: + for key, value in audios.items(): + self.sw.add_audio(key, value, self.global_step, audio_sampling_rate) + + def write_valid_summary( + self, losses, stats, images={}, audios={}, audio_sampling_rate=24000, tag="val" + ): + for key, value in losses.items(): + self.sw.add_scalar(tag + "/" + key, value, self.step) + + if len(images) != 0: + for key, value in images.items(): + self.sw.add_image(key, value, self.global_step, batchformats="HWC") + if len(audios) != 0: + for key, value in audios.items(): + self.sw.add_audio(key, value, self.global_step, audio_sampling_rate) + + def _get_state_dict(self): + state_dict = { + "generator": self.model["generator"].state_dict(), + "discriminator": self.model["discriminator"].state_dict(), + "optimizer_g": self.optimizer["optimizer_g"].state_dict(), + "optimizer_d": self.optimizer["optimizer_d"].state_dict(), + "scheduler_g": self.scheduler["scheduler_g"].state_dict(), + "scheduler_d": self.scheduler["scheduler_d"].state_dict(), + "step": self.step, + "epoch": self.epoch, + "batch_size": self.cfg.train.batch_size, + } + return state_dict + + def get_state_dict(self): + state_dict = { + "generator": self.model["generator"].state_dict(), + "discriminator": self.model["discriminator"].state_dict(), + "optimizer_g": self.optimizer["optimizer_g"].state_dict(), + "optimizer_d": self.optimizer["optimizer_d"].state_dict(), + "scheduler_g": self.scheduler["scheduler_g"].state_dict(), + "scheduler_d": self.scheduler["scheduler_d"].state_dict(), + "step": self.step, + "epoch": self.epoch, + "batch_size": self.cfg.train.batch_size, + } + return state_dict + + def load_model(self, checkpoint): + self.step = checkpoint["step"] + self.epoch = checkpoint["epoch"] + self.model["generator"].load_state_dict(checkpoint["generator"]) + self.model["discriminator"].load_state_dict(checkpoint["discriminator"]) + self.optimizer["optimizer_g"].load_state_dict(checkpoint["optimizer_g"]) + self.optimizer["optimizer_d"].load_state_dict(checkpoint["optimizer_d"]) + self.scheduler["scheduler_g"].load_state_dict(checkpoint["scheduler_g"]) + self.scheduler["scheduler_d"].load_state_dict(checkpoint["scheduler_d"]) + + @torch.inference_mode() + def _valid_step(self, batch): + r"""Testing forward step. Should return average loss of a sample over + one batch. Provoke ``_forward_step`` is recommended except for special case. + See ``_test_epoch`` for usage. 
+ """ + + valid_losses = {} + total_loss = 0 + valid_stats = {} + + # Discriminator + # Generator output + outputs_g = self.model["generator"](batch) + + y_mel = slice_segments( + batch["mel"].transpose(1, 2), + outputs_g["ids_slice"], + self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size, + ) + y_hat_mel = mel_spectrogram_torch( + outputs_g["y_hat"].squeeze(1), self.cfg.preprocess + ) + y = slice_segments( + batch["audio"].unsqueeze(1), + outputs_g["ids_slice"] * self.cfg.preprocess.hop_size, + self.cfg.preprocess.segment_size, + ) + + # Discriminator output + outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach()) + ## Discriminator loss + loss_d = self.criterion["discriminator"]( + outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"] + ) + valid_losses.update(loss_d) + + ## Generator + outputs_d = self.model["discriminator"](y, outputs_g["y_hat"]) + loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel) + valid_losses.update(loss_g) + + for item in valid_losses: + valid_losses[item] = valid_losses[item].item() + + total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"] + + return ( + total_loss.item(), + valid_losses, + valid_stats, + ) + + def _train_step(self, batch): + r"""Forward step for training and inference. This function is called + in ``_train_step`` & ``_test_step`` function. + """ + + train_losses = {} + total_loss = 0 + training_stats = {} + + ## Train Discriminator + # Generator output + outputs_g = self.model["generator"](batch) + + y_mel = slice_segments( + batch["mel"].transpose(1, 2), + outputs_g["ids_slice"], + self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size, + ) + y_hat_mel = mel_spectrogram_torch( + outputs_g["y_hat"].squeeze(1), self.cfg.preprocess + ) + + y = slice_segments( + # [1, 168418] -> [1, 1, 168418] + batch["audio"].unsqueeze(1), + outputs_g["ids_slice"] * self.cfg.preprocess.hop_size, + self.cfg.preprocess.segment_size, + ) + + # Discriminator output + outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach()) + # Discriminator loss + loss_d = self.criterion["discriminator"]( + outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"] + ) + train_losses.update(loss_d) + + # BP and Grad Updated + self.optimizer["optimizer_d"].zero_grad() + self.accelerator.backward(loss_d["loss_disc_all"]) + self.optimizer["optimizer_d"].step() + + ## Train Generator + outputs_d = self.model["discriminator"](y, outputs_g["y_hat"]) + loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel) + train_losses.update(loss_g) + + # BP and Grad Updated + self.optimizer["optimizer_g"].zero_grad() + self.accelerator.backward(loss_g["loss_gen_all"]) + self.optimizer["optimizer_g"].step() + + for item in train_losses: + train_losses[item] = train_losses[item].item() + + total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"] + + return ( + total_loss.item(), + train_losses, + training_stats, + ) + + def _train_epoch(self): + r"""Training epoch. Should return average loss of a batch (sample) over + one epoch. See ``train_loop`` for usage. 
+ """ + epoch_sum_loss: float = 0.0 + epoch_losses: dict = {} + epoch_step: int = 0 + for batch in tqdm( + self.train_dataloader, + desc=f"Training Epoch {self.epoch}", + unit="batch", + colour="GREEN", + leave=False, + dynamic_ncols=True, + smoothing=0.04, + disable=not self.accelerator.is_main_process, + ): + # Do training step and BP + with self.accelerator.accumulate(self.model): + total_loss, train_losses, training_stats = self._train_step(batch) + self.batch_count += 1 + + # Update info for each step + if self.batch_count % self.cfg.train.gradient_accumulation_step == 0: + epoch_sum_loss += total_loss + for key, value in train_losses.items(): + if key not in epoch_losses.keys(): + epoch_losses[key] = value + else: + epoch_losses[key] += value + + self.accelerator.log( + { + "Step/Generator Loss": train_losses["loss_gen_all"], + "Step/Discriminator Loss": train_losses["loss_disc_all"], + "Step/Generator Learning Rate": self.optimizer[ + "optimizer_d" + ].param_groups[0]["lr"], + "Step/Discriminator Learning Rate": self.optimizer[ + "optimizer_g" + ].param_groups[0]["lr"], + }, + step=self.step, + ) + self.step += 1 + epoch_step += 1 + + self.accelerator.wait_for_everyone() + + epoch_sum_loss = ( + epoch_sum_loss + / len(self.train_dataloader) + * self.cfg.train.gradient_accumulation_step + ) + + for key in epoch_losses.keys(): + epoch_losses[key] = ( + epoch_losses[key] + / len(self.train_dataloader) + * self.cfg.train.gradient_accumulation_step + ) + + return epoch_sum_loss, epoch_losses + + def _build_singer_lut(self): + resumed_singer_path = None + if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "": + resumed_singer_path = os.path.join( + self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id + ) + if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)): + resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id) + + if resumed_singer_path: + with open(resumed_singer_path, "r") as f: + singers = json.load(f) + else: + singers = dict() + + for dataset in self.cfg.dataset: + singer_lut_path = os.path.join( + self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id + ) + with open(singer_lut_path, "r") as singer_lut_path: + singer_lut = json.load(singer_lut_path) + for singer in singer_lut.keys(): + if singer not in singers: + singers[singer] = len(singers) + + with open( + os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w" + ) as singer_file: + json.dump(singers, singer_file, indent=4, ensure_ascii=False) + print( + "singers have been dumped to {}".format( + os.path.join(self.exp_dir, self.cfg.preprocess.spk2id) + ) + ) + return singers From d4dc30b9f84933860956e3a449ae91b7741c48ee Mon Sep 17 00:00:00 2001 From: Xueyao Zhang Date: Thu, 29 Feb 2024 17:14:35 +0800 Subject: [PATCH 07/12] Support On-the-fly Features Extraction (#145) Support on-the-fly features extraction for the large-scale data preprocessing --- modules/encoder/condition_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/encoder/condition_encoder.py b/modules/encoder/condition_encoder.py index f274e870..2bd4b67f 100644 --- a/modules/encoder/condition_encoder.py +++ b/modules/encoder/condition_encoder.py @@ -254,7 +254,7 @@ def forward(self, x): singer_info = speaker_enc_out.expand(-1, seq_len, -1) outputs.append(singer_info) - if "spkemb" in x.keys(): + if self.cfg.use_spkemb: speaker_embedding = self.speaker_project( x["spkemb"].unsqueeze(1) ) # [b, 1, 384] From 
14f8acbc765596dc6fe50700632801d8467f406c Mon Sep 17 00:00:00 2001 From: yuantuo666 Date: Sun, 25 Feb 2024 01:52:44 +0800 Subject: [PATCH 08/12] Fix typos and improve clarity in README --- egs/tts/VITS/README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index 5fed5492..8df28790 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -143,11 +143,6 @@ Here are some example scenarios to better understand how to use these arguments: ## 4. Inference -### Pre-trained Model Download - -We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech according to the following inference instruction. - - ### Configuration For inference, you need to specify the following configurations when running `run.sh`: From 6ecd7498669516e5d7c58e294e5e37d046453a96 Mon Sep 17 00:00:00 2001 From: mingyang Date: Fri, 5 Jan 2024 10:37:14 +0800 Subject: [PATCH 09/12] Add using of speaker embedding --- processors/acoustic_extractor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index be3d4a74..0634da6a 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -271,6 +271,12 @@ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt): wav = wav[start:end].astype(np.float32) wav_torch = torch.from_numpy(wav).to(wav_torch.device) + if cfg.preprocess.extract_speaker: + voice_encoder = resemblyzer.VoiceEncoder(verbose=False) + speaker_wav = resemblyzer.preprocess_wav(wav_path) + speaker_embedding = voice_encoder.embed_utterance(speaker_wav) + save_feature(dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding) + if cfg.preprocess.extract_linear_spec: from utils.mel import extract_linear_features From 1b2715b5702c43789f7fab096d58700aad1f5ae4 Mon Sep 17 00:00:00 2001 From: mingyang Date: Sat, 13 Jan 2024 13:02:17 +0800 Subject: [PATCH 10/12] Change input to Hubert token and normalized pitch --- processors/acoustic_extractor.py | 6 ------ processors/content_extractor.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 0634da6a..be3d4a74 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -271,12 +271,6 @@ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt): wav = wav[start:end].astype(np.float32) wav_torch = torch.from_numpy(wav).to(wav_torch.device) - if cfg.preprocess.extract_speaker: - voice_encoder = resemblyzer.VoiceEncoder(verbose=False) - speaker_wav = resemblyzer.preprocess_wav(wav_path) - speaker_embedding = voice_encoder.embed_utterance(speaker_wav) - save_feature(dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding) - if cfg.preprocess.extract_linear_spec: from utils.mel import extract_linear_features diff --git a/processors/content_extractor.py b/processors/content_extractor.py index 6a5b2e36..51723d2c 100644 --- a/processors/content_extractor.py +++ b/processors/content_extractor.py @@ -711,3 +711,36 @@ def extract_utt_content_features_dataloader(cfg, metadata, num_workers): ) for index, utt in enumerate(_metadata): extractor.save_feature(utt, batch_content_features[index]) + + if cfg.preprocess.extract_hubert_feature: + feat_dir = os.path.join( + cfg.preprocess.processed_dir, dataset_name, "hubert" + ) + 
os.makedirs(feat_dir, exist_ok=True) + feat_files_num = len(os.listdir(feat_dir)) + if feat_files_num != len(metadata): + hubert_waveforms = LibrosaDataset( + cfg, + dataset_name, + cfg.preprocess.hubert_sample_rate, + metadata=metadata, + ) + data_loader = DataLoader( + hubert_waveforms, + num_workers=num_workers, + shuffle=False, + pin_memory=cfg.preprocess.pin_memory, + batch_size=cfg.preprocess.content_feature_batch_size, + collate_fn=collate_batch, + drop_last=False, + ) + extractor = HubertExtractor(cfg) + extractor.load_model() + for batch_idx, items in enumerate(tqdm(data_loader)): + _metadata, wavs, lens = items + + batch_content_features = extractor.extract_content_features( + wavs, + ) + for index, utt in enumerate(_metadata): + extractor.save_feature(utt, batch_content_features[index]) From e6ce21becf4ae6f6b3ef8ea76f3941d5b64acb65 Mon Sep 17 00:00:00 2001 From: mingyang Date: Wed, 17 Apr 2024 10:59:48 +0800 Subject: [PATCH 11/12] Add implement of VitsVC --- bins/vc/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bins/vc/train.py b/bins/vc/train.py index e0ddeaee..e8174246 100644 --- a/bins/vc/train.py +++ b/bins/vc/train.py @@ -81,9 +81,7 @@ def main(): for dataset in cfg.preprocess.data_augment: new_datasets = [ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, - f"{dataset}_formant_shift" - if cfg.preprocess.use_formant_shift - else None, + f"{dataset}_formant_shift" if cfg.preprocess.use_formant_shift else None, f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, ] From 3a05aa1b60fa5d0f840204540ceeaa055e496769 Mon Sep 17 00:00:00 2001 From: mingyang Date: Wed, 17 Apr 2024 11:03:36 +0800 Subject: [PATCH 12/12] Add implement of VitsVC --- bins/vc/train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bins/vc/train.py b/bins/vc/train.py index e8174246..0444b8e1 100644 --- a/bins/vc/train.py +++ b/bins/vc/train.py @@ -81,7 +81,11 @@ def main(): for dataset in cfg.preprocess.data_augment: new_datasets = [ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, - f"{dataset}_formant_shift" if cfg.preprocess.use_formant_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, ]
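The patches above save a per-utterance Resemblyzer embedding during preprocessing (PATCH 09) and broadcast a projected `spkemb` vector over the content frames inside the condition encoder (PATCH 07). The sketch below is a minimal, self-contained illustration of that conditioning path; the wav path `reference.wav`, the output file name, and the values `seq_len = 320` and `content_dim = 192` (the latter taken from the VitsVC `exp_config.json` above) are illustrative assumptions, not values fixed by these patches.

```python
# Sketch only (not part of the patch series): extract a Resemblyzer speaker
# embedding for one reference utterance, then project and broadcast it over
# T content frames, mirroring the use_spkemb branch of the condition encoder.
# "reference.wav", "reference_spkemb.npy", seq_len=320 and content_dim=192
# are illustrative assumptions.
import numpy as np
import torch
import torch.nn as nn
from resemblyzer import VoiceEncoder, preprocess_wav

# 1) Utterance-level speaker embedding (256-dim), as in the preprocessing patch,
#    which saves one .npy per utterance under cfg.preprocess.speaker_dir.
voice_encoder = VoiceEncoder(verbose=False)
spk_emb = voice_encoder.embed_utterance(preprocess_wav("reference.wav"))  # np.ndarray, shape (256,)
np.save("reference_spkemb.npy", spk_emb)

# 2) Conditioning: project the fixed-size embedding to the content dimension
#    and repeat the single vector along the time axis.
content_dim, seq_len = 192, 320
speaker_project = nn.Linear(spk_emb.shape[-1], content_dim)

spkemb = torch.from_numpy(np.load("reference_spkemb.npy")).unsqueeze(0)  # [1, 256]
cond = speaker_project(spkemb.unsqueeze(1))  # [1, 1, content_dim]
cond = cond.expand(-1, seq_len, -1)          # [1, T, content_dim], merged with content features
print(cond.shape)                            # torch.Size([1, 320, 192])
```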