Here's the code I'm running. I trained ResNet50 on CIFAR10 and it runs smoothly for 200 epochs without aihwkit; the code is derived from https://github.com/kuangliu/pytorch-cifar. However, after I convert the model to an analog model, training hangs (thread blocking) after some steps or epochs, as shown in the image. Can someone help me figure out what might be causing this?
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50
import os
import argparse
from models import *
from utils import progress_bar
from aihwkit.nn.conversion import convert_to_analog
from aihwkit.optim import AnalogSGD
from aihwkit.simulator.presets import PresetIOParameters
from aihwkit.inference import PCMLikeNoiseModel, GlobalDriftCompensation
from aihwkit.simulator.configs import (
    InferenceRPUConfig,
    WeightModifierType,
    WeightClipType,
    WeightNoiseType,
    BoundManagementType,
    NoiseManagementType,
    WeightClipParameter,
    WeightModifierParameter,
    MappingParameter,
)
def create_ideal_rpu_config(tile_size=512):
    """Create RPU Config with ideal conditions"""
    rpu_config = InferenceRPUConfig(
        mapping=MappingParameter(
            digital_bias=True,
            learn_out_scaling=True,
            weight_scaling_omega=1.0,
            out_scaling_columnwise=False,
            weight_scaling_columnwise=True,
            max_input_size=tile_size,
            max_output_size=0,
        ),
        forward=PresetIOParameters(is_perfect=True),
        noise_model=PCMLikeNoiseModel(prog_noise_scale=0.0, read_noise_scale=0.0, drift_scale=0.0),
        drift_compensation=None,
    )
    return rpu_config
def create_rpu_config(tile_size=512, dac_res=256, adc_res=256):
    """Create RPU Config with noisy, hardware-aware settings"""
    modifier_noise = 0.1
    rpu_config = InferenceRPUConfig(
        clip=WeightClipParameter(type=WeightClipType.FIXED_VALUE, fixed_value=1.0),
        modifier=WeightModifierParameter(
            rel_to_actual_wmax=True, type=WeightModifierType.ADD_NORMAL, std_dev=modifier_noise
        ),
        mapping=MappingParameter(
            digital_bias=True,
            learn_out_scaling=True,
            weight_scaling_omega=1.0,
            out_scaling_columnwise=True,
            weight_scaling_columnwise=True,
            max_input_size=tile_size,
            max_output_size=0,
        ),
        forward=PresetIOParameters(
            w_noise_type=WeightNoiseType.PCM_READ,
            w_noise=0.0175,
            inp_res=dac_res,
            out_res=adc_res,
            out_bound=10.0,
            out_noise=0.04,
            bound_management=BoundManagementType.ITERATIVE,
            noise_management=NoiseManagementType.ABS_MAX,
        ),
        noise_model=PCMLikeNoiseModel(),
        drift_compensation=GlobalDriftCompensation(),
    )
    return rpu_config
parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true',
                    help='resume from checkpoint')
args = parser.parse_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0 # best test accuracy
start_epoch = 0 # start from epoch 0 or last checkpoint epoch
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
print('==> Building model..')
net = ResNet50()
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True
if args.resume:
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.pth')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
# Convert to an analog model. Note that this replaces the ResNet50 built above
# with torchvision's resnet50 converted to analog tiles.
RPU_CONFIG = create_ideal_rpu_config()
net = convert_to_analog(resnet50().to(device), RPU_CONFIG)
net.remap_analog_weights()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                         % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.pth')
        best_acc = acc
for epoch in range(start_epoch, start_epoch+200):
    train(epoch)
    test(epoch)
    scheduler.step()
Hi @xsu0960,
We have never seen a thread-blocking issue like this before, and it is hard to debug with the information given here. I would suspect it is caused by some other tool you are using (for instance the data loader, the progress bar, or the submission environment). Also, you are not using AnalogSGD, which should always be used instead of SGD so that the optimizer is aware of the analog components. If training hangs, it may be insightful to hit CTRL-C and look at the error trace. Additionally, try removing all other tools, such as the progress bar, and see whether it still hangs. You might also want to examine the Python data loader, which sometimes hangs as well. Sketches of both suggestions are below.
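For reference, a minimal sketch of the optimizer setup with AnalogSGD, reusing the hyperparameters already in your script (they are not a recommendation), could look like this:

import torch
from aihwkit.optim import AnalogSGD

# Use AnalogSGD so the optimizer knows how to update the analog tiles.
optimizer = AnalogSGD(net.parameters(), lr=args.lr,
                      momentum=0.9, weight_decay=5e-4)
# Regroup the parameter groups so the analog tile parameters get the analog update rule.
optimizer.regroup_param_groups(net)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)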
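To narrow down where it hangs, one possible (untested) sketch is to run the loaders without worker processes and let Python periodically dump all thread stacks via the standard-library faulthandler module, so you still get a trace when CTRL-C does not get through:

import faulthandler

# Dump every thread's stack to stderr every 5 minutes; a hang then leaves a trace.
faulthandler.dump_traceback_later(300, repeat=True)

# num_workers=0 loads data in the main process and rules out DataLoader worker hangs.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=0)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=0)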