-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added code to paper "Speaker Anonymization with Phonetic Intermediate…
… Representations"
- Loading branch information
Sarina Meyer
committed
Sep 14, 2022
1 parent
0034362
commit be1a719
Showing
30 changed files
with
1,915 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
models/ | ||
original_speaker_embeddings/ | ||
corpora/ | ||
results/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[submodule "Voice-Privacy-Challenge-2020"] | ||
path = Voice-Privacy-Challenge-2020 | ||
url = https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2020 | ||
[submodule "IMS-Toucan"] | ||
path = IMS-Toucan | ||
url = https://github.com/Flux9665/IMS-Toucan | ||
branch = vp_inference/1912a835c4b3de20f5190797e684f10aa45a76d9 |
Submodule IMS-Toucan
added at
1912a8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,121 @@ | ||
# Speaker Anonymization | ||
|
||
The code, descriptions and a link to the demo will be added soon. | ||
This repository contains the speaker anonymization system developed at the Institute for Natural Language Processing | ||
(IMS) at the University of Stuttgart, Germany. The system is described in our paper [*Speaker Anonymization with | ||
Phonetic Intermediate Representations*](https://arxiv.org/abs/2207.04834) that will be | ||
published at | ||
Interspeech 2022. | ||
|
||
**In addition to the code, we are going to provide a live demo soon.** | ||
|
||
## System Description | ||
The system is based on the Voice Privacy Challenge 2020, which is included as a submodule. It uses the basic idea of | ||
speaker embedding anonymization with neural synthesis, and uses the data and evaluation framework of the challenge. | ||
For a detailed description of the system, please read our paper linked above. | ||
|
||
![architecture](figures/architecture.png) | ||
|
||
|
||
## Installation | ||
Clone this repository with all its submodules: | ||
``` | ||
git clone --recurse-submodules https://github.com/DigitalPhonetics/speaker-anonymization.git | ||
``` | ||
|
||
In order to be able to use the framework of the Voice Privacy Challenge 2020 for evaluation, you need to install it | ||
first. According to [the challenge repository](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2020), this should simply be | ||
``` | ||
cd Voice-Privacy-Challenge-2020 | ||
./install.sh | ||
``` | ||
However, on our systems, we had to make certain adjustments and also decided to use a more light-weight environment | ||
that minimizes unnecessary components. If you are interested, you can see our steps in | ||
[alternative_challenge_framework_installation.md](alternative_challenge_framework_installation.md). Just as a note: It is | ||
very possible that those would not directly work on your system and would need to be modified. | ||
|
||
**Note: this step will download and install Kaldi, and might lead to complications. Additionally, make sure that you | ||
are running the install script on a device with access to GPUs and CUDA.** | ||
|
||
Additionally, install the [requirements](requirements.txt) (in the base directory of this repository): | ||
``` | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Getting started | ||
Before the actual execution of our pipeline system, you first need to download and prepare the challenge data and | ||
the evaluation models. For | ||
this, you will need a password provided by the organizers of the Voice Privacy Challenge. Please contact them (see | ||
information on [their repository](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2020) or | ||
[website](https://www.voiceprivacychallenge.org/)) for | ||
this access. | ||
|
||
You can do this by either | ||
|
||
### a) Executing our lightweight scripts: | ||
This will only download and prepare the necessary models and datasets. Note that these scripts are simply extracts | ||
of the challenge run script. | ||
``` | ||
cd setup_scripts | ||
./run_download_data.sh | ||
./run_prepare_data.sh | ||
``` | ||
|
||
or by | ||
### b) Executing the challenge run script: | ||
This will download and prepare everything necessary AND run the baseline system of the Voice Privacy Challenge 2020. | ||
Note that you will need to have installed the whole framework by the challenge install script before. | ||
``` | ||
cd Voice-Privacy-Challenge-2020/baseline | ||
./run.sh | ||
``` | ||
|
||
### Running the pipeline | ||
The system pipeline is controlled in [run_inference.py](run_inference.py). You can run it via | ||
``` | ||
python run_inference.py --gpu <gpu_id> | ||
``` | ||
with <gpu_id> being the ID of the GPU the code should be executed on. If this option is not specified, it will run | ||
on CPU (not recommended). | ||
|
||
The script will anonymize the development and test data of LibriSpeech and VCTK in three steps: | ||
1. ASR: Recognition of the linguistic content, output in form of text or phone sequences | ||
2. Anonymization: Modification of speaker embeddings, output as torch vectors | ||
3. TTS: Synthesis based on recognized transcription and anonymized speaker embedding, output as audio files (wav) | ||
|
||
Each module produces intermediate results that are saved to disk. A module is only executed if previous intermediate | ||
results for the dependent pipeline combination do not exist or if recomputation is forced. Otherwise, the previous | ||
results are loaded. Example: The ASR module is | ||
only executed if there are no transcriptions produced by exactly that ASR model. On the other hand, the TTS is | ||
executed if (a) the ASR was performed directly before (new transcriptions), and/or (b) the anonymization was | ||
performed directly before (new speaker embeddings), and/or (c) no TTS results exist for this combination of models. | ||
|
||
If you want to change any settings, like the particular models or datasets, you can adjust the *settings* dictionary | ||
in [run_inference.py](run_inference.py). If you want to force recomputation for a specific module, add its tag to | ||
the *force_compute* list. | ||
|
||
Immediately after the anonymization pipeline terminates, the evaluation pipeline is started. It performs some | ||
preparation steps and then executes the evaluation part of the challenge run script (this extract can be found in | ||
[evaluation/run_evaluation.sh](evaluation/run_evaluation.sh)). | ||
|
||
Finally, for clarity, the most important parts of the evaluation results as well as the used settings are copied to | ||
the [results](results) directory. | ||
|
||
|
||
## Models | ||
The following table lists all models for each module that are reported in the paper and are included in this | ||
repository. Each model is given by its name in the directory and the name used in the paper. In the *settings* | ||
dictionary in [run_inference.py](run_inference.py), the model name should be used. An *x* in the *Default* column marks the | ||
models that are used in the main configuration of the system. | ||
|
||
| Module | Default| Model name | Name in paper| | ||
|--------|--------|------------|--------------| | ||
| ASR | x | asr_tts-phn_en.zip | phones | | ||
| | | asr_stt_en | STT | | ||
| | | asr_tts_en.zip | TTS | | ||
| Anonymization | x | pool_minmax_ecapa+xvector | pool | | ||
| | | pool_raw_ecapa+xvector | pool raw | | ||
| | | random_in-scale_ecapa+xvector | random | | ||
| TTS | x | trained_on_ground_truth_phonemes.pt| Libri100| | ||
| | | trained_on_asr_phoneme_outputs.pt | Libri100 + finetuned | | ||
| | | trained_on_libri600_asr_phoneme_outputs.pt | Libri600 | | ||
| | | trained_on_libri600_ground_truth_phonemes.pt | Libri600 + finetuned | |
Submodule Voice-Privacy-Challenge-2020
added at
f58ef1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Alternative Installation of the Framework for the Voice Privacy Challenge 2020 | ||
Unfortunately, the installation is not always as easy as the organizers imply in their [install | ||
script](Voice-Privacy-Challenge-2020/install.sh), and the script installs several tools that are only necessary if the primary | ||
baseline of the challenge is to be executed. To adapt the script to our devices and pipeline, we shortened and | ||
modified it, and exchanged some components. | ||
|
||
**Note: To run the code in this repository, it is NOT necessary to use the installation steps described in this | ||
document. Instead, you can also simply use the original [install | ||
script](Voice-Privacy-Challenge-2020/install.sh). If you use this document, be aware that you probably have to | ||
modify several steps to make it work for you.** | ||
|
||
## Installation Steps | ||
This guide expects that you cloned the repository including its submodules. Once you have followed the installation steps | ||
described in the following, continue with the *Getting started* section in the [main README](README.md). | ||
|
||
### 1. Environment creation | ||
The original installation script would create a conda environment but conda would include many packages that are not | ||
always needed. We therefore 'manually' create a virtual environment within the | ||
repository: | ||
``` | ||
virtualenv venv --python=python3.8 | ||
source venv/bin/activate | ||
pip install -r Voice-Privacy-Challenge-2020/requirements.txt | ||
``` | ||
Instead of the last line, if you want to install all requirements for the whole repository, you can instead run | ||
``` | ||
pip install -r requirements.txt | ||
``` | ||
(If this does not work, install the requirements files listed in it separately) | ||
|
||
Finally, we have to make the install script skip the step of creating an environment by creating the required check | ||
file: | ||
``` | ||
touch Voice-Privacy-Challenge-2020/.done-venv | ||
``` | ||
|
||
### 2. Adapting Kaldi | ||
The version of Kaldi in the framework is not up to date, and even the up to date one does not officially support our | ||
gcc version. We have to change that: | ||
``` | ||
cd Voice-Privacy-Challenge-2020/kaldi | ||
git checkout master | ||
vim src/configure | ||
``` | ||
In src/configure, raise the minimum unsupported gcc version: | ||
``` | ||
- MIN_UNSUPPORTED_GCC_VER="10.0" | ||
- MIN_UNSUPPORTED_GCC_VER_NUM=100000; | ||
+ MIN_UNSUPPORTED_GCC_VER="12.0" | ||
+ MIN_UNSUPPORTED_GCC_VER_NUM=120000; | ||
``` | ||
|
||
### 3. CUDA and MKL | ||
Due to several installed versions of CUDA and MKL, and very specific requirements of Kaldi, we have to specify the | ||
paths to them in the [setup_scripts/install_challenge_framework.sh](setup_scripts/install_challenge_framework.sh) file. | ||
|
||
### 4. Installation | ||
Once everything above is resolved, you simply have to run the adapted install script: | ||
``` | ||
cd setup_scripts | ||
./install_challenge_framework.sh | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .pool_anonymizer import PoolAnonymizer | ||
from .random_anonymizer import RandomAnonymizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from pathlib import Path | ||
import torch | ||
|
||
from .speaker_embeddings import SpeakerEmbeddings | ||
|
||
|
||
class BaseAnonymizer:
    """Base class for speaker embedding anonymization.

    Stores the embedding type, the embedding level and the torch device, and
    offers helpers for loading and saving speaker embeddings. The methods
    ``load_parameters``, ``save_parameters`` and ``anonymize_data`` are
    templates that raise ``NotImplementedError`` here.
    """

    def __init__(self, vec_type='xvector', device=None, emb_level='spk', **kwargs):
        """Store the configuration and resolve ``device`` to a ``torch.device``.

        ``device`` may be a ``torch.device``, a device string, or an integer
        that is interpreted as a CUDA index. Any other value (including the
        default ``None``) selects CUDA when it is available and CPU otherwise.
        Additional keyword arguments are accepted but not used here.
        """
        self.vec_type = vec_type
        self.emb_level = emb_level

        if isinstance(device, torch.device):
            resolved = device
        elif isinstance(device, str):
            resolved = torch.device(device)
        elif isinstance(device, int):
            # An integer is taken to be the index of a CUDA device.
            resolved = torch.device(f'cuda:{device}')
        else:
            fallback = 'cuda' if torch.cuda.is_available() else 'cpu'
            resolved = torch.device(fallback)
        self.device = resolved

    def load_parameters(self, model_dir: Path):
        """Template for loading method-specific parameters. Not implemented."""
        raise NotImplementedError('load_parameters')

    def save_parameters(self, model_dir: Path):
        """Template for saving method-specific parameters. Not implemented."""
        raise NotImplementedError('save_parameters')

    def load_embeddings(self, emb_dir: Path):
        """Load stored speaker embeddings from ``emb_dir`` and return them."""
        loaded = SpeakerEmbeddings(self.vec_type, device=self.device, emb_level=self.emb_level)
        loaded.load_vectors(emb_dir)
        return loaded

    def save_embeddings(self, embeddings, emb_dir):
        """Write ``embeddings`` to ``emb_dir`` via their ``save_vectors`` method."""
        embeddings.save_vectors(emb_dir)

    def anonymize_data(self, data_dir: Path, vector_dir: Path, emb_level='spk'):
        """Template for anonymizing a dataset. Not implemented."""
        raise NotImplementedError('anonymize_data')

    def _get_speaker_embeddings(self, data_dir: Path, vector_dir: Path, emb_level='spk'):
        """Return the original speaker embeddings, loading or extracting them.

        If ``vector_dir`` exists, the embeddings are loaded from it. Otherwise
        they are extracted from the audio in ``data_dir`` and then saved to
        ``vector_dir``.
        """
        spk_vectors = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=emb_level, device=self.device)

        if not vector_dir.exists():
            # Nothing stored yet: extract from audio and persist for later runs.
            spk_vectors.extract_vectors_from_audio(data_dir=data_dir)
            spk_vectors.save_vectors(out_dir=vector_dir)
            return spk_vectors

        spk_vectors.load_vectors(in_dir=vector_dir)
        return spk_vectors
Oops, something went wrong.