PyPI Downloads visitors contributions welcome PyPI - Python Version GitHub last commit GitHub code size in bytes GitHub Gitter Twitter Follow
WavEncoder is a Python library for encoding audio signals, transforms for audio augmentation, and training audio classification models with PyTorch backend.
| Layers | Models | Transforms | Trainer and utils |
|---|---|---|---|
|
|
|
|
- wav2vec [1]
- wav2vec2 [2]
- SincNet [3]
- PASE [4]
- MockingJay [5]
- RawNet [6]
- GaborNet [7]
- LEAF [8]
- CNN-1D
- CNN-LSTM
- CNN-LSTM-Attn
Check the Demo Colab Notebook.
Use the package manager pip to install wavencoder.
pip install wavencoder
```python
import torch
import wavencoder

x = torch.randn(1, 16000)  # [1, 16000]
encoder = wavencoder.models.Wav2Vec(pretrained=True)
z = encoder(x)  # [1, 512, 98]

classifier = wavencoder.models.LSTM_Attn_Classifier(512, 64, 2,
                                                    return_attn_weights=True,
                                                    attn_type='soft')
y_hat, attn_weights = classifier(z)  # [1, 2], [1, 98]
```
```python
import torch
import torch.nn as nn
import wavencoder

model = nn.Sequential(
    wavencoder.models.Wav2Vec(),
    wavencoder.models.LSTM_Attn_Classifier(512, 64, 2,
                                           return_attn_weights=True,
                                           attn_type='soft')
)

x = torch.randn(1, 16000)  # [1, 16000]
y_hat, attn_weights = model(x)  # [1, 2], [1, 98]
```
```python
import torch
import torch.nn as nn
import wavencoder

class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()
        self.encoder = wavencoder.models.Wav2Vec(pretrained=True)
        self.classifier = nn.Linear(512, 2)

    def forward(self, x):
        z = self.encoder(x)
        z = torch.mean(z, dim=2)
        out = self.classifier(z)
        return out

model = AudioClassifier()
x = torch.randn(1, 16000)  # [1, 16000]
y_hat = model(x)  # [1, 2]
```
```python
import torch.nn as nn
from wavencoder.models import Wav2Vec, LSTM_Attn_Classifier
from wavencoder.trainer import train, test_evaluate_classifier, test_predict_classifier

model = nn.Sequential(
    Wav2Vec(pretrained=False),
    LSTM_Attn_Classifier(512, 64, 2)
)

trainloader = ...
valloader = ...
testloader = ...

trained_model, train_dict = train(model, trainloader, valloader, n_epochs=20)
test_prediction_dict = test_predict_classifier(trained_model, testloader)
```
```python
import torchaudio
from wavencoder.transforms import Compose, AdditiveNoise, SpeedChange, Clipping, PadCrop, Reverberation

audio, _ = torchaudio.load('test.wav')

transforms = Compose([
    AdditiveNoise('path-to-noise-folder', snr_levels=[5, 10, 15], p=0.5),
    SpeedChange(factor_range=(-0.5, 0.0), p=0.5),
    Clipping(p=0.5),
    PadCrop(48000, crop_position='random', pad_position='random')
])

transformed_audio = transforms(audio)
```
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
Please make sure to update tests as appropriate.