This tutorial shows how to train encoding models on speech features using the LeBel assembly.

Speech features capture acoustic and linguistic information from audio stimuli by extracting internal representations from speech models such as Whisper or HuBERT. These features can be highly predictive of brain activity during audio-based experiments.
```python
from encoding.assembly.assembly_loader import load_assembly

# Load the pre-packaged LeBel assembly
assembly = load_assembly("assembly_lebel_uts03.pkl")
```
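To confirm the assembly loaded correctly, a quick sanity check might look like this (assuming the `stories` attribute used in the next step is list-like):

```python
# Quick sanity check: list the stories packaged in the assembly
print(f"Loaded {len(assembly.stories)} stories")
print(list(assembly.stories)[:5])
```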
```python
import os

# Set up audio paths for the speech model
base_audio_path = "/path/to/your/audio/files"  # Replace with your audio path

for story_name in assembly.stories:
    # Assuming audio files are named like: story_name.wav
    audio_file_path = os.path.join(base_audio_path, f"{story_name}.wav")

    # Set the audio path for this story
    if hasattr(assembly, "story_data") and story_name in assembly.story_data:
        assembly.story_data[story_name].audio_path = audio_file_path
        print(f"Set audio path for {story_name}: {audio_file_path}")
```
```python
from encoding.features.factory import FeatureExtractorFactory

extractor = FeatureExtractorFactory.create_extractor(
    modality="speech",
    model_name="openai/whisper-tiny",  # Can be changed to other speech models
    config={
        "model_name": "openai/whisper-tiny",
        "chunk_size": 0.1,  # Seconds between chunk starts (stride)
        "context_size": 16.0,  # Seconds of audio per window
        "layer": 3,  # Layer index to extract features from
        "pool": "last",  # Pooling method: 'last' or 'mean'
        "target_sample_rate": 16000,  # Target sample rate for audio
        "device": "cuda",  # Can be "cuda" or "cpu"
    },
    cache_dir="cache_speech",
)
```
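To make the chunking parameters concrete: with a 0.1 s stride and a 16 s context window, each feature vector summarizes up to 16 s of audio ending at its chunk boundary. A back-of-the-envelope calculation (purely illustrative, with a hypothetical story length):

```python
# A hypothetical 10-minute story with a 0.1 s stride yields ~6000 chunks,
# each pooled from up to 16 s of preceding audio
duration_s = 600.0  # hypothetical story length
stride_s = 0.1      # chunk_size above
print(int(duration_s / stride_s))  # 6000 feature vectors
```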
```python
from encoding.downsample.downsampling import Downsampler
from encoding.models.nested_cv import NestedCVModel

downsampler = Downsampler()
model = NestedCVModel(model_name="ridge_regression")

# FIR delays for hemodynamic response modeling (see the sketch below)
fir_delays = [1, 2, 3, 4]
```
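The FIR delays account for the lag of the hemodynamic response: each delay shifts the stimulus features by a whole number of TRs, and the shifted copies are concatenated so the regression can learn the response shape. A minimal sketch of the idea (a hypothetical helper, not the library's implementation):

```python
import numpy as np

def make_delayed(features: np.ndarray, delays: list[int]) -> np.ndarray:
    """Concatenate copies of `features`, each shifted down by one delay (in TRs)."""
    n_trs, n_feats = features.shape
    delayed = np.zeros((n_trs, n_feats * len(delays)))
    for i, d in enumerate(delays):
        delayed[d:, i * n_feats:(i + 1) * n_feats] = features[: n_trs - d]
    return delayed

X = make_delayed(np.random.randn(100, 8), [1, 2, 3, 4])
print(X.shape)  # (100, 32): one shifted copy per delay
```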
```python
# Trimming configuration for the LeBel dataset
trimming_config = {
    "train_features_start": 10,
    "train_features_end": -5,
    "train_targets_start": 0,
    "train_targets_end": None,
    "test_features_start": 50,
    "test_features_end": -5,
    "test_targets_start": 40,
    "test_targets_end": None,
}

# No additional downsampling configuration needed
downsample_config = {}
```
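The start/end values read like Python slice bounds: negative ends count from the back, and `None` means no trimming. As an illustration of what the training-feature settings above would keep (an assumption about the semantics, shown on dummy data):

```python
import numpy as np

features = np.random.randn(300, 64)  # hypothetical (TRs, features) matrix
trimmed = features[10:-5]            # train_features_start=10, train_features_end=-5
print(trimmed.shape)                 # (285, 64)
```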
```python
from encoding.trainer import AbstractTrainer

trainer = AbstractTrainer(
    assembly=assembly,
    feature_extractors=[extractor],
    downsampler=downsampler,
    model=model,
    fir_delays=fir_delays,
    trimming_config=trimming_config,
    use_train_test_split=True,
    logger_backend="wandb",
    wandb_project_name="lebel-speech-model",
    dataset_type="lebel",
    results_dir="results",
    layer_idx=3,  # Pass layer_idx to the trainer
)
```
```python
metrics = trainer.train()
print(f"Median correlation: {metrics.get('median_score', float('nan')):.4f}")
```
Speech features are extracted with the following configuration:

- `modality`: `"speech"` - specifies the feature type
- `model_name`: `"openai/whisper-tiny"` - speech model to use
- `chunk_size`: `0.1` - seconds between chunk starts (stride)
- `context_size`: `16.0` - seconds of audio per window
- `layer`: `3` - which layer to extract features from
- `pool`: `"last"` - pooling method (`"last"` or `"mean"`)
- `target_sample_rate`: `16000` - target sample rate for audio
- `device`: `"cuda"` - device to run the model on
- `cache_dir`: `"cache_speech"` - directory for caching

The speech extractor caches extracted features on disk, which makes it efficient to experiment with different layers without recomputing features.
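A minimal sketch of how such a cache could behave, assuming hidden states for all layers are stored once per story so that changing `layer` becomes a cache hit (`forward_all_layers` is a hypothetical stand-in for the real model pass):

```python
import os
import numpy as np

def forward_all_layers(story: str) -> np.ndarray:
    """Hypothetical stand-in for the expensive model forward pass."""
    return np.random.randn(4, 6000, 384)  # (layers, chunks, feature dims)

def get_layer_features(story: str, layer: int, cache_dir: str = "cache_speech") -> np.ndarray:
    """Run the model once per story; later layer changes only re-read the cache."""
    path = os.path.join(cache_dir, f"{story}.npy")
    if not os.path.exists(path):
        os.makedirs(cache_dir, exist_ok=True)
        np.save(path, forward_all_layers(story))
    return np.load(path)[layer]
```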