Feature: Add knowledge distillation support #2595
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Status: Open. Changes from all commits: +181 −0.
Commits (6):

- b48e88c Add knowledge distillation model and loss function support (mrT23)
- 3c5135c Merge branch 'master' of github.com:mrT23/pytorch-image-models (rwightman)
- d705d67 Cleanup distillation code (rwightman)
- 6dcbc22 Keep as no_grad (rwightman)
- 92b4850 Merge branch 'huggingface:main' into master (mrT23)
- 0eb45e3 Merge remote-tracking branch 'origin/master' (mrT23)
timm/kd/__init__.py (+4 −0)
```python
"""Knowledge Distillation module for timm"""
from .distillation import DistillationTeacher, apply_kd_loss

__all__ = ['DistillationTeacher', 'apply_kd_loss']
```
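The re-export means both helpers can be imported directly from `timm.kd` rather than from the `timm.kd.distillation` submodule, as the usage sketch at the end of this diff does.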
timm/kd/distillation.py (+142 −0)
The first half of the module defines the teacher wrapper:

```python
"""Knowledge Distillation helpers for training with a teacher model."""
import logging
from typing import Optional

import torch
import torch.nn as nn
import torchvision.transforms as T

from timm.models import create_model

_logger = logging.getLogger(__name__)


class DistillationTeacher(nn.Module):
    """Wrapper for a teacher model used in knowledge distillation.

    Creates and manages a pre-trained teacher model for knowledge distillation,
    handling model compilation and normalization differences between teacher
    and student.

    Args:
        model_name: Name of the teacher model to create
        num_classes: Number of output classes
        in_chans: Number of input channels
        device: Device to place the model on (default: 'cuda')
        dtype: Model dtype (default: None, uses float32)
    """

    def __init__(
            self,
            model_name: str,
            num_classes: int,
            in_chans: int = 3,
            device: torch.device = torch.device('cuda'),
            dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()

        _logger.info(f"Creating KD teacher model: '{model_name}'")

        model_kd = create_model(
            model_name=model_name,
            num_classes=num_classes,
            pretrained=True,
            in_chans=in_chans,
        )

        model_kd = model_kd.to(device=device, dtype=dtype)
        model_kd.eval()

        try:
            model_kd = torch.compile(model_kd)
            _logger.info("torch.compile applied successfully to KD teacher model")
        except Exception as e:
            _logger.warning(f"torch.compile failed with error {e}, continuing without compilation")

        self.model = model_kd
        self.mean_model_kd = model_kd.pretrained_cfg['mean']
        self.std_model_kd = model_kd.pretrained_cfg['std']

    def normalize_input(
            self,
            input: torch.Tensor,
            student_model: nn.Module,
    ) -> torch.Tensor:
        """Normalize input to match the teacher's expected normalization.

        Handles different normalization between teacher and student models by
        converting the student's normalized input to the teacher's expected format.

        Args:
            input: Input tensor (already normalized for the student)
            student_model: Student model to extract normalization params from

        Returns:
            Input tensor normalized for the teacher model
        """
        # Unwrap DDP/DataParallel-style wrappers to reach pretrained_cfg
        if hasattr(student_model, 'module'):
            model_s = student_model.module
        else:
            model_s = student_model

        mean_student = model_s.pretrained_cfg['mean']
        std_student = model_s.pretrained_cfg['std']

        input_kd = input
        if mean_student != self.mean_model_kd or std_student != self.std_model_kd:
            # Remap x_s = (x - mean_s) / std_s to x_t = (x - mean_t) / std_t,
            # i.e. x_t = x_s * (std_s / std_t) + (mean_s - mean_t) / std_t
            std = tuple(t_std / s_std for t_std, s_std in zip(self.std_model_kd, std_student))
            transform_std = T.Normalize(mean=(0, 0, 0), std=std)

            # The mean shift is scaled by the teacher std as well
            mean = tuple(
                (t_mean - s_mean) / t_std
                for t_mean, s_mean, t_std in zip(self.mean_model_kd, mean_student, self.std_model_kd)
            )
            transform_mean = T.Normalize(mean=mean, std=(1, 1, 1))

            input_kd = transform_mean(transform_std(input))

        return input_kd
```
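The remapping in `normalize_input` can be sanity-checked numerically. Below is a small standalone sketch (the stats are illustrative, ImageNet-style for the student and Inception-style for the teacher) verifying that the two composed `Normalize` transforms reproduce direct teacher normalization:

```python
import torch
import torchvision.transforms as T

# Illustrative stats: ImageNet-style student, Inception-style teacher
mean_s, std_s = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
mean_t, std_t = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)

pixels = torch.rand(1, 3, 4, 4)               # raw pixels in [0, 1]
x_s = T.Normalize(mean_s, std_s)(pixels)      # input as the student sees it
x_t = T.Normalize(mean_t, std_t)(pixels)      # input as the teacher expects it

# Same two-transform remap as normalize_input above
std = tuple(t / s for t, s in zip(std_t, std_s))
mean = tuple((t - s) / ts for t, s, ts in zip(mean_t, mean_s, std_t))
x_kd = T.Normalize(mean=mean, std=(1, 1, 1))(T.Normalize(mean=(0, 0, 0), std=std)(x_s))

torch.testing.assert_close(x_kd, x_t)         # agrees up to float rounding
```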
The second half of the file defines the loss helper that a train loop calls:

```python
def apply_kd_loss(
        loss: torch.Tensor,
        student_output: torch.Tensor,
        input: torch.Tensor,
        student_model: nn.Module,
        teacher_model: DistillationTeacher,
        alpha_kd: float,
        use_kd_only: bool = False,
) -> torch.Tensor:
    """Apply knowledge distillation loss.

    Computes KL divergence between student and teacher outputs and combines
    it with the base loss (or replaces it if use_kd_only is True).

    Args:
        loss: Base loss (e.g., cross-entropy with labels)
        student_output: Logits from the student model
        input: Input tensor (already normalized for the student)
        student_model: Student model being trained
        teacher_model: Teacher model for distillation
        alpha_kd: Weight for the KD loss component
        use_kd_only: If True, use only the KD loss (ignore the base loss)

    Returns:
        Combined loss with the KD component
    """
    # Student log-probabilities (kl_div expects log-probs as input)
    prob_s = torch.nn.functional.log_softmax(student_output, dim=-1)

    # Teacher probabilities, computed without tracking gradients
    with torch.no_grad():
        input_kd = teacher_model.normalize_input(input, student_model)
        out_t = teacher_model.model(input_kd.detach())
        prob_t = torch.nn.functional.softmax(out_t, dim=-1)

    # KL divergence between student and teacher distributions
    kd_loss = alpha_kd * torch.nn.functional.kl_div(prob_s, prob_t, reduction='batchmean')

    if use_kd_only:
        return kd_loss
    return loss + kd_loss
```
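To show how the two helpers fit together, here is a minimal training-step sketch. It is an illustration, not code from this PR: the model names, the `alpha_kd` value, and the `train_loader` iterable are placeholder assumptions.

```python
import torch
import torch.nn.functional as F

from timm import create_model
from timm.kd import DistillationTeacher, apply_kd_loss

device = torch.device('cuda')

# Hypothetical student/teacher pairing; any timm models would do
student = create_model('resnet50', num_classes=1000).to(device)
teacher = DistillationTeacher('convnext_large', num_classes=1000, device=device)
optimizer = torch.optim.AdamW(student.parameters(), lr=1e-3)

student.train()
for images, targets in train_loader:  # train_loader is assumed: any (image, label) loader
    images, targets = images.to(device), targets.to(device)

    output = student(images)
    loss = F.cross_entropy(output, targets)      # base supervised loss

    # Blend in the distillation term (inputs are already student-normalized)
    loss = apply_kd_loss(
        loss=loss,
        student_output=output,
        input=images,
        student_model=student,
        teacher_model=teacher,
        alpha_kd=5.0,                            # weighting is task-dependent
        use_kd_only=False,
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```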