This tutorial shows how to train encoding models using static word embeddings with the LeBel assembly. Static embeddings provide pre-trained word representations that can be highly predictive of brain activity.
Static embeddings capture semantic relationships between words using pre-trained models such as Word2Vec or GloVe. Each word is mapped to a single fixed vector, independent of its context, which yields the rich semantic representations used as features in this tutorial.
from encoding.assembly.assembly_loader import load_assembly
from encoding.features.factory import FeatureExtractorFactory
from encoding.downsample.downsampling import Downsampler
from encoding.models.nested_cv import NestedCVModel
from encoding.trainer import AbstractTrainer

# ---------------------------------------------------------------------------
# Step 1: load the data
# ---------------------------------------------------------------------------
# The LeBel assembly is distributed pre-packaged; load it by file name.
assembly = load_assembly("assembly_lebel_uts03.pkl")

# ---------------------------------------------------------------------------
# Step 2: build the static-embedding feature extractor
# ---------------------------------------------------------------------------
# Point this at your own embedding file before running.
vector_path = "/path/to/your/embeddings.bin.gz"  # Replace with your path

extractor = FeatureExtractorFactory.create_extractor(
    modality="embeddings",
    model_name="word2vec",  # Can be "word2vec", "glove", or any identifier
    config=dict(
        vector_path=vector_path,
        binary=True,  # True for .bin files, False for .txt files
        lowercase=False,  # True if your embeddings expect lowercase tokens
        oov_handling="copy_prev",  # How out-of-vocabulary words are handled
        use_tqdm=True,  # Show a progress bar
    ),
    cache_dir="cache",
)

# ---------------------------------------------------------------------------
# Step 3: downsampler and model
# ---------------------------------------------------------------------------
downsampler = Downsampler()
model = NestedCVModel(model_name="ridge_regression")

# ---------------------------------------------------------------------------
# Step 4: training configuration
# ---------------------------------------------------------------------------
# FIR delays for hemodynamic response modeling.
fir_delays = list(range(1, 5))  # [1, 2, 3, 4]

# Trimming configuration for the LeBel dataset.
trimming_config = dict(
    train_features_start=10,
    train_features_end=-5,
    train_targets_start=0,
    train_targets_end=None,
    test_features_start=50,
    test_features_end=-5,
    test_targets_start=40,
    test_targets_end=None,
)

# No additional downsampling configuration is needed.
downsample_config = dict()

# ---------------------------------------------------------------------------
# Step 5: assemble the trainer, run it, and report the headline metric
# ---------------------------------------------------------------------------
trainer = AbstractTrainer(
    assembly=assembly,
    feature_extractors=[extractor],
    downsampler=downsampler,
    model=model,
    fir_delays=fir_delays,
    trimming_config=trimming_config,
    use_train_test_split=True,
    logger_backend="wandb",
    wandb_project_name="lebel-embeddings",
    dataset_type="lebel",
    results_dir="results",
    downsample_config=downsample_config,
)

metrics = trainer.train()

# Fall back to NaN so the report still prints if the key is absent.
median_score = metrics.get("median_score", float("nan"))
print(f"Median correlation: {median_score:.4f}")
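- `binary=True`: use when the embedding file is in binary format (`.bin` files).
- `binary=False`: use when the embedding file is in text format (`.txt` files).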
Choose based on your research question and data characteristics.