From 43204d193bba53a562885f23d50b748b21fc98b0 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Sun, 23 Feb 2020 22:04:52 +0700 Subject: [PATCH 01/10] Formatted with Black --- backbone.py | 295 ++++-- data/coco.py | 150 +-- data/config.py | 1595 +++++++++++++++++-------------- data/scripts/mix_sets.py | 96 +- eval.py | 1018 +++++++++++++------- external/DCNv2/dcn_v2.py | 451 +++++---- external/DCNv2/setup.py | 4 +- external/DCNv2/test.py | 200 ++-- layers/box_utils.py | 189 ++-- layers/functions/__init__.py | 2 +- layers/functions/detection.py | 120 ++- layers/interpolate.py | 15 +- layers/modules/__init__.py | 2 +- layers/modules/multibox_loss.py | 467 ++++++--- layers/output_utils.py | 119 ++- run_coco_eval.py | 60 +- scripts/augment_bbox.py | 191 ++-- scripts/bbox_recall.py | 106 +- scripts/cluster_bbox_sizes.py | 69 +- scripts/compute_masks.py | 73 +- scripts/convert_darknet.py | 47 +- scripts/convert_sbd.py | 90 +- scripts/make_grid.py | 266 +++--- scripts/optimize_bboxes.py | 126 ++- scripts/parse_eval.py | 58 +- scripts/plot_loss.py | 109 ++- scripts/save_bboxes.py | 34 +- scripts/unpack_statedict.py | 8 +- train.py | 540 +++++++---- utils/__init__.py | 2 +- utils/augmentations.py | 213 +++-- utils/functions.py | 93 +- utils/logger.py | 319 ++++--- utils/nvinfo.py | 75 +- utils/timer.py | 174 ++-- web/server.py | 104 +- yolact.py | 490 ++++++---- 37 files changed, 4789 insertions(+), 3181 deletions(-) diff --git a/backbone.py b/backbone.py index 4df59d023..254168008 100644 --- a/backbone.py +++ b/backbone.py @@ -7,28 +7,60 @@ try: from dcn_v2 import DCN except ImportError: + def DCN(*args, **kwdargs): - raise Exception('DCN could not be imported. If you want to use YOLACT++ models, compile DCN. Check the README for instructions.') + raise Exception( + "DCN could not be imported. If you want to use YOLACT++ models, compile DCN. Check the README for instructions." + ) + class Bottleneck(nn.Module): """ Adapted from torchvision.models.resnet """ + expansion = 4 - def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False): + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + norm_layer=nn.BatchNorm2d, + dilation=1, + use_dcn=False, + ): super(Bottleneck, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation) + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=1, bias=False, dilation=dilation + ) self.bn1 = norm_layer(planes) if use_dcn: - self.conv2 = DCN(planes, planes, kernel_size=3, stride=stride, - padding=dilation, dilation=dilation, deformable_groups=1) + self.conv2 = DCN( + planes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + deformable_groups=1, + ) self.conv2.bias.data.zero_() self.conv2.conv_offset_mask.weight.data.zero_() self.conv2.conv_offset_mask.bias.data.zero_() else: - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, - padding=dilation, bias=False, dilation=dilation) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, + ) self.bn2 = norm_layer(planes) - self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation) + self.conv3 = nn.Conv2d( + planes, planes * 4, kernel_size=1, bias=False, dilation=dilation + ) self.bn3 = norm_layer(planes * 4) self.relu = nn.ReLU(inplace=True) self.downsample = downsample @@ -60,7 +92,15 @@ def forward(self, x): class ResNetBackbone(nn.Module): """ Adapted from torchvision.models.resnet """ - def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d): + def __init__( + self, + layers, + dcn_layers=[0, 0, 0, 0], + dcn_interval=1, + atrous_layers=[], + block=Bottleneck, + norm_layer=nn.BatchNorm2d, + ): super().__init__() # These will be populated by _make_layer @@ -73,25 +113,49 @@ def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layer # From torchvision.models.resnet.Resnet self.inplanes = 64 - + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = norm_layer(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval) - self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval) - self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval) - self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval) + + self._make_layer( + block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval + ) + self._make_layer( + block, + 128, + layers[1], + stride=2, + dcn_layers=dcn_layers[1], + dcn_interval=dcn_interval, + ) + self._make_layer( + block, + 256, + layers[2], + stride=2, + dcn_layers=dcn_layers[2], + dcn_interval=dcn_interval, + ) + self._make_layer( + block, + 512, + layers[3], + stride=2, + dcn_layers=dcn_layers[3], + dcn_interval=dcn_interval, + ) # This contains every module that should be initialized by loading in pretrained weights. # Any extra layers added onto this that won't be initialized by init_backbone will not be # in this list. That way, Yolact::init_weights knows which backbone weights to initialize # with xavier, and which ones to leave alone. self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] - - - def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1): + + def _make_layer( + self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1 + ): """ Here one layer means a string of n Bottleneck blocks. """ downsample = None @@ -101,21 +165,40 @@ def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interva if len(self.layers) in self.atrous_layers: self.dilation += 1 stride = 1 - + downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False, - dilation=self.dilation), + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + dilation=self.dilation, + ), self.norm_layer(planes * block.expansion), ) layers = [] - use_dcn = (dcn_layers >= blocks) - layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn)) + use_dcn = dcn_layers >= blocks + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + self.norm_layer, + self.dilation, + use_dcn=use_dcn, + ) + ) self.inplanes = planes * block.expansion for i in range(1, blocks): - use_dcn = ((i+dcn_layers) >= blocks) and (i % dcn_interval == 0) - layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn)) + use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0) + layers.append( + block( + self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn + ) + ) layer = nn.Sequential(*layers) self.channels.append(planes * block.expansion) @@ -145,9 +228,9 @@ def init_backbone(self, path): # Replace layer1 -> layers.0 etc. keys = list(state_dict) for key in keys: - if key.startswith('layer'): + if key.startswith("layer"): idx = int(key[5]) - new_key = 'layers.' + str(idx-1) + key[6:] + new_key = "layers." + str(idx - 1) + key[6:] state_dict[new_key] = state_dict.pop(key) # Note: Using strict=False is berry scary. Triple check this. @@ -155,70 +238,65 @@ def init_backbone(self, path): def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): """ Add a downsample layer to the backbone as per what SSD does. """ - self._make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample) - - + self._make_layer( + block, conv_channels // block.expansion, blocks=depth, stride=downsample + ) class ResNetBackboneGN(ResNetBackbone): - def __init__(self, layers, num_groups=32): super().__init__(layers, norm_layer=lambda x: nn.GroupNorm(num_groups, x)) def init_backbone(self, path): """ The path here comes from detectron. So we load it differently. """ - with open(path, 'rb') as f: - state_dict = pickle.load(f, encoding='latin1') # From the detectron source - state_dict = state_dict['blobs'] - + with open(path, "rb") as f: + state_dict = pickle.load(f, encoding="latin1") # From the detectron source + state_dict = state_dict["blobs"] + our_state_dict_keys = list(self.state_dict().keys()) new_state_dict = {} - - gn_trans = lambda x: ('gn_s' if x == 'weight' else 'gn_b') - layeridx2res = lambda x: 'res' + str(int(x)+2) - block2branch = lambda x: 'branch2' + ('a', 'b', 'c')[int(x[-1:])-1] + + gn_trans = lambda x: ("gn_s" if x == "weight" else "gn_b") + layeridx2res = lambda x: "res" + str(int(x) + 2) + block2branch = lambda x: "branch2" + ("a", "b", "c")[int(x[-1:]) - 1] # Transcribe each Detectron weights name to a Yolact weights name for key in our_state_dict_keys: - parts = key.split('.') - transcribed_key = '' + parts = key.split(".") + transcribed_key = "" - if (parts[0] == 'conv1'): - transcribed_key = 'conv1_w' - elif (parts[0] == 'bn1'): - transcribed_key = 'conv1_' + gn_trans(parts[1]) - elif (parts[0] == 'layers'): - if int(parts[1]) >= self.num_base_layers: continue + if parts[0] == "conv1": + transcribed_key = "conv1_w" + elif parts[0] == "bn1": + transcribed_key = "conv1_" + gn_trans(parts[1]) + elif parts[0] == "layers": + if int(parts[1]) >= self.num_base_layers: + continue transcribed_key = layeridx2res(parts[1]) - transcribed_key += '_' + parts[2] + '_' + transcribed_key += "_" + parts[2] + "_" - if parts[3] == 'downsample': - transcribed_key += 'branch1_' - - if parts[4] == '0': - transcribed_key += 'w' + if parts[3] == "downsample": + transcribed_key += "branch1_" + + if parts[4] == "0": + transcribed_key += "w" else: transcribed_key += gn_trans(parts[5]) else: - transcribed_key += block2branch(parts[3]) + '_' + transcribed_key += block2branch(parts[3]) + "_" - if 'conv' in parts[3]: - transcribed_key += 'w' + if "conv" in parts[3]: + transcribed_key += "w" else: transcribed_key += gn_trans(parts[4]) new_state_dict[key] = torch.Tensor(state_dict[transcribed_key]) - + # strict=False because we may have extra unitialized layers at this point self.load_state_dict(new_state_dict, strict=False) - - - - - def darknetconvlayer(in_channels, out_channels, *args, **kwdargs): """ Implements a conv, activation, then batch norm. @@ -229,9 +307,10 @@ def darknetconvlayer(in_channels, out_channels, *args, **kwdargs): nn.BatchNorm2d(out_channels), # Darknet uses 0.1 here. # See https://github.com/pjreddie/darknet/blob/680d3bde1924c8ee2d1c1dea54d3e56a05ca9a26/src/activations.h#L39 - nn.LeakyReLU(0.1, inplace=True) + nn.LeakyReLU(0.1, inplace=True), ) + class DarkNetBlock(nn.Module): """ Note: channels is the lesser of the two. The output will be expansion * channels. """ @@ -240,15 +319,15 @@ class DarkNetBlock(nn.Module): def __init__(self, in_channels, channels): super().__init__() - self.conv1 = darknetconvlayer(in_channels, channels, kernel_size=1) - self.conv2 = darknetconvlayer(channels, channels * self.expansion, kernel_size=3, padding=1) + self.conv1 = darknetconvlayer(in_channels, channels, kernel_size=1) + self.conv2 = darknetconvlayer( + channels, channels * self.expansion, kernel_size=3, padding=1 + ) def forward(self, x): return self.conv2(self.conv1(x)) + x - - class DarkNetBackbone(nn.Module): """ An implementation of YOLOv3's Darnet53 in @@ -264,12 +343,12 @@ def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock): self.num_base_layers = len(layers) self.layers = nn.ModuleList() self.channels = [] - + self._preconv = darknetconvlayer(3, 32, kernel_size=3, padding=1) self.in_channels = 32 - - self._make_layer(block, 32, layers[0]) - self._make_layer(block, 64, layers[1]) + + self._make_layer(block, 32, layers[0]) + self._make_layer(block, 64, layers[1]) self._make_layer(block, 128, layers[2]) self._make_layer(block, 256, layers[3]) self._make_layer(block, 512, layers[4]) @@ -279,15 +358,21 @@ def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock): # in this list. That way, Yolact::init_weights knows which backbone weights to initialize # with xavier, and which ones to leave alone. self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] - + def _make_layer(self, block, channels, num_blocks, stride=2): """ Here one layer means a string of n blocks. """ layer_list = [] # The downsample layer layer_list.append( - darknetconvlayer(self.in_channels, channels * block.expansion, - kernel_size=3, padding=1, stride=stride)) + darknetconvlayer( + self.in_channels, + channels * block.expansion, + kernel_size=3, + padding=1, + stride=stride, + ) + ) # Each block inputs channels and outputs channels * expansion self.in_channels = channels * block.expansion @@ -310,17 +395,16 @@ def forward(self, x): def add_layer(self, conv_channels=1024, stride=2, depth=1, block=DarkNetBlock): """ Add a downsample layer to the backbone as per what SSD does. """ - self._make_layer(block, conv_channels // block.expansion, num_blocks=depth, stride=stride) - + self._make_layer( + block, conv_channels // block.expansion, num_blocks=depth, stride=stride + ) + def init_backbone(self, path): """ Initializes the backbone weights for training. """ # Note: Using strict=False is berry scary. Triple check this. self.load_state_dict(torch.load(path), strict=False) - - - class VGGBackbone(nn.Module): """ Args: @@ -334,11 +418,11 @@ class VGGBackbone(nn.Module): def __init__(self, cfg, extra_args=[], norm_layers=[]): super().__init__() - + self.channels = [] self.layers = nn.ModuleList() self.in_channels = 3 - self.extra_args = list(reversed(extra_args)) # So I can use it as a stack + self.extra_args = list(reversed(extra_args)) # So I can use it as a stack # Keeps track of what the corresponding key will be in the state dict of the # pretrained model. For instance, layers.0.2 for us is 2 for the pretrained @@ -349,7 +433,9 @@ def __init__(self, cfg, extra_args=[], norm_layers=[]): for idx, layer_cfg in enumerate(cfg): self._make_layer(layer_cfg) - self.norms = nn.ModuleList([nn.BatchNorm2d(self.channels[l]) for l in norm_layers]) + self.norms = nn.ModuleList( + [nn.BatchNorm2d(self.channels[l]) for l in norm_layers] + ) self.norm_lookup = {l: idx for idx, l in enumerate(norm_layers)} # These modules will be initialized by init_backbone, @@ -373,26 +459,29 @@ def _make_layer(self, cfg): v = v[0] # v should be either M or a number - if v == 'M': + if v == "M": # Set default arguments if args is None: - args = {'kernel_size': 2, 'stride': 2} + args = {"kernel_size": 2, "stride": 2} layers.append(nn.MaxPool2d(**args)) else: # See the comment in __init__ for an explanation of this cur_layer_idx = self.total_layer_count + len(layers) - self.state_dict_lookup[cur_layer_idx] = '%d.%d' % (len(self.layers), len(layers)) + self.state_dict_lookup[cur_layer_idx] = "%d.%d" % ( + len(self.layers), + len(layers), + ) # Set default arguments if args is None: - args = {'kernel_size': 3, 'padding': 1} + args = {"kernel_size": 3, "padding": 1} # Add the layers layers.append(nn.Conv2d(self.in_channels, v, **args)) layers.append(nn.ReLU(inplace=True)) self.in_channels = v - + self.total_layer_count += len(layers) self.channels.append(self.in_channels) self.layers.append(nn.Sequential(*layers)) @@ -403,25 +492,27 @@ def forward(self, x): for idx, layer in enumerate(self.layers): x = layer(x) - + # Apply an l2norm module to the selected layers # Note that this differs from the original implemenetation if idx in self.norm_lookup: x = self.norms[self.norm_lookup[idx]](x) outs.append(x) - + return tuple(outs) def transform_key(self, k): """ Transform e.g. features.24.bias to layers.4.1.bias """ - vals = k.split('.') + vals = k.split(".") layerIdx = self.state_dict_lookup[int(vals[0])] - return 'layers.%s.%s' % (layerIdx, vals[1]) + return "layers.%s.%s" % (layerIdx, vals[1]) def init_backbone(self, path): """ Initializes the backbone weights for training. """ state_dict = torch.load(path) - state_dict = OrderedDict([(self.transform_key(k), v) for k,v in state_dict.items()]) + state_dict = OrderedDict( + [(self.transform_key(k), v) for k, v in state_dict.items()] + ) self.load_state_dict(state_dict, strict=False) @@ -429,21 +520,25 @@ def add_layer(self, conv_channels=128, downsample=2): """ Add a downsample layer to the backbone as per what SSD does. """ if len(self.extra_args) > 0: conv_channels, downsample = self.extra_args.pop() - + padding = 1 if downsample > 1 else 0 - + layer = nn.Sequential( nn.Conv2d(self.in_channels, conv_channels, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(conv_channels, conv_channels*2, kernel_size=3, stride=downsample, padding=padding), - nn.ReLU(inplace=True) + nn.Conv2d( + conv_channels, + conv_channels * 2, + kernel_size=3, + stride=downsample, + padding=padding, + ), + nn.ReLU(inplace=True), ) - self.in_channels = conv_channels*2 + self.in_channels = conv_channels * 2 self.channels.append(self.in_channels) self.layers.append(layer) - - def construct_backbone(cfg): diff --git a/data/coco.py b/data/coco.py index f182c55bd..75e5197c0 100644 --- a/data/coco.py +++ b/data/coco.py @@ -10,16 +10,19 @@ from pycocotools import mask as maskUtils import random + def get_label_map(): if cfg.dataset.label_map is None: - return {x+1: x+1 for x in range(len(cfg.dataset.class_names))} + return {x + 1: x + 1 for x in range(len(cfg.dataset.class_names))} else: - return cfg.dataset.label_map + return cfg.dataset.label_map + class COCOAnnotationTransform(object): """Transforms a COCO annotation into a Tensor of bbox coords and label index Initilized with a dictionary lookup of classnames to indexes """ + def __init__(self): self.label_map = get_label_map() @@ -35,12 +38,15 @@ def __call__(self, target, width, height): scale = np.array([width, height, width, height]) res = [] for obj in target: - if 'bbox' in obj: - bbox = obj['bbox'] - label_idx = obj['category_id'] + if "bbox" in obj: + bbox = obj["bbox"] + label_idx = obj["category_id"] if label_idx >= 0: label_idx = self.label_map[label_idx] - 1 - final_box = list(np.array([bbox[0], bbox[1], bbox[0]+bbox[2], bbox[1]+bbox[3]])/scale) + final_box = list( + np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) + / scale + ) final_box.append(label_idx) res += [final_box] # [xmin, ymin, xmax, ymax, label_idx] else: @@ -61,25 +67,31 @@ class COCODetection(data.Dataset): prep_crowds (bool): Whether or not to prepare crowds for the evaluation step. """ - def __init__(self, image_path, info_file, transform=None, - target_transform=None, - dataset_name='MS COCO', has_gt=True): + def __init__( + self, + image_path, + info_file, + transform=None, + target_transform=None, + dataset_name="MS COCO", + has_gt=True, + ): # Do this here because we have too many things named COCO from pycocotools.coco import COCO - + if target_transform is None: target_transform = COCOAnnotationTransform() self.root = image_path self.coco = COCO(info_file) - + self.ids = list(self.coco.imgToAnns.keys()) if len(self.ids) == 0 or not has_gt: self.ids = list(self.coco.imgs.keys()) - + self.transform = transform self.target_transform = COCOAnnotationTransform() - + self.name = dataset_name self.has_gt = has_gt @@ -119,30 +131,30 @@ def pull_item(self, index): # Separate out crowd annotations. These are annotations that signify a large crowd of # objects of said class, where there is no annotation for each individual object. Both # during testing and training, consider these crowds as neutral. - crowd = [x for x in target if ('iscrowd' in x and x['iscrowd'])] - target = [x for x in target if not ('iscrowd' in x and x['iscrowd'])] + crowd = [x for x in target if ("iscrowd" in x and x["iscrowd"])] + target = [x for x in target if not ("iscrowd" in x and x["iscrowd"])] num_crowds = len(crowd) for x in crowd: - x['category_id'] = -1 + x["category_id"] = -1 # This is so we ensure that all crowd annotations are at the end of the array target += crowd - + # The split here is to have compatibility with both COCO2014 and 2017 annotations. # In 2014, images have the pattern COCO_{train/val}2014_%012d.jpg, while in 2017 it's %012d.jpg. # Our script downloads the images as %012d.jpg so convert accordingly. - file_name = self.coco.loadImgs(img_id)[0]['file_name'] - - if file_name.startswith('COCO'): - file_name = file_name.split('_')[-1] + file_name = self.coco.loadImgs(img_id)[0]["file_name"] + + if file_name.startswith("COCO"): + file_name = file_name.split("_")[-1] path = osp.join(self.root, file_name) - assert osp.exists(path), 'Image path does not exist: {}'.format(path) - + assert osp.exists(path), "Image path does not exist: {}".format(path) + img = cv2.imread(path) height, width, _ = img.shape - + if len(target) > 0: # Pool all the masks for this image into one [num_objects,height,width] matrix masks = [self.coco.annToMask(obj).reshape(-1) for obj in target] @@ -155,28 +167,45 @@ def pull_item(self, index): if self.transform is not None: if len(target) > 0: target = np.array(target) - img, masks, boxes, labels = self.transform(img, masks, target[:, :4], - {'num_crowds': num_crowds, 'labels': target[:, 4]}) - + img, masks, boxes, labels = self.transform( + img, + masks, + target[:, :4], + {"num_crowds": num_crowds, "labels": target[:, 4]}, + ) + # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations - num_crowds = labels['num_crowds'] - labels = labels['labels'] - + num_crowds = labels["num_crowds"] + labels = labels["labels"] + target = np.hstack((boxes, np.expand_dims(labels, axis=1))) else: - img, _, _, _ = self.transform(img, np.zeros((1, height, width), dtype=np.float), np.array([[0, 0, 1, 1]]), - {'num_crowds': 0, 'labels': np.array([0])}) + img, _, _, _ = self.transform( + img, + np.zeros((1, height, width), dtype=np.float), + np.array([[0, 0, 1, 1]]), + {"num_crowds": 0, "labels": np.array([0])}, + ) masks = None target = None if target.shape[0] == 0: - print('Warning: Augmentation output an example with no ground truth. Resampling...') - return self.pull_item(random.randint(0, len(self.ids)-1)) - - return torch.from_numpy(img).permute(2, 0, 1), target, masks, height, width, num_crowds + print( + "Warning: Augmentation output an example with no ground truth. Resampling..." + ) + return self.pull_item(random.randint(0, len(self.ids) - 1)) + + return ( + torch.from_numpy(img).permute(2, 0, 1), + target, + masks, + height, + width, + num_crowds, + ) def pull_image(self, index): - '''Returns the original image object at index in PIL form + """Returns the original image object at index in PIL form Note: not using self.__getitem__(), as any transformations passed in could mess up this functionality. @@ -185,13 +214,13 @@ def pull_image(self, index): index (int): index of img to show Return: cv2 img - ''' + """ img_id = self.ids[index] - path = self.coco.loadImgs(img_id)[0]['file_name'] + path = self.coco.loadImgs(img_id)[0]["file_name"] return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR) def pull_anno(self, index): - '''Returns the original annotation of image at index + """Returns the original annotation of image at index Note: not using self.__getitem__(), as any transformations passed in could mess up this functionality. @@ -201,21 +230,26 @@ def pull_anno(self, index): Return: list: [img_id, [(label, bbox coords),...]] eg: ('001718', [('dog', (96, 13, 438, 332))]) - ''' + """ img_id = self.ids[index] ann_ids = self.coco.getAnnIds(imgIds=img_id) return self.coco.loadAnns(ann_ids) def __repr__(self): - fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' - fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) - fmt_str += ' Root Location: {}\n'.format(self.root) - tmp = ' Transforms (if any): ' - fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) - tmp = ' Target Transforms (if any): ' - fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + fmt_str = "Dataset " + self.__class__.__name__ + "\n" + fmt_str += " Number of datapoints: {}\n".format(self.__len__()) + fmt_str += " Root Location: {}\n".format(self.root) + tmp = " Transforms (if any): " + fmt_str += "{0}{1}\n".format( + tmp, self.transform.__repr__().replace("\n", "\n" + " " * len(tmp)) + ) + tmp = " Target Transforms (if any): " + fmt_str += "{0}{1}".format( + tmp, self.target_transform.__repr__().replace("\n", "\n" + " " * len(tmp)) + ) return fmt_str + def enforce_size(img, targets, masks, num_crowds, new_w, new_h): """ Ensures that the image is the given size without distorting aspect ratio. """ with torch.no_grad(): @@ -223,7 +257,7 @@ def enforce_size(img, targets, masks, num_crowds, new_w, new_h): if h == new_h and w == new_w: return img, targets, masks, num_crowds - + # Resize the image so that it fits within new_w, new_h w_prime = new_w h_prime = h * new_w / w @@ -236,25 +270,27 @@ def enforce_size(img, targets, masks, num_crowds, new_w, new_h): h_prime = int(h_prime) # Do all the resizing - img = F.interpolate(img.unsqueeze(0), (h_prime, w_prime), mode='bilinear', align_corners=False) + img = F.interpolate( + img.unsqueeze(0), (h_prime, w_prime), mode="bilinear", align_corners=False + ) img.squeeze_(0) # Act like each object is a color channel - masks = F.interpolate(masks.unsqueeze(0), (h_prime, w_prime), mode='bilinear', align_corners=False) + masks = F.interpolate( + masks.unsqueeze(0), (h_prime, w_prime), mode="bilinear", align_corners=False + ) masks.squeeze_(0) # Scale bounding boxes (this will put them in the top left corner in the case of padding) - targets[:, [0, 2]] *= (w_prime / new_w) - targets[:, [1, 3]] *= (h_prime / new_h) + targets[:, [0, 2]] *= w_prime / new_w + targets[:, [1, 3]] *= h_prime / new_h # Finally, pad everything to be the new_w, new_h pad_dims = (0, new_w - w_prime, 0, new_h - h_prime) - img = F.pad( img, pad_dims, mode='constant', value=0) - masks = F.pad(masks, pad_dims, mode='constant', value=0) + img = F.pad(img, pad_dims, mode="constant", value=0) + masks = F.pad(masks, pad_dims, mode="constant", value=0) return img, targets, masks, num_crowds - - def detection_collate(batch): diff --git a/data/config.py b/data/config.py index 91b4c82ea..b46cbb841 100644 --- a/data/config.py +++ b/data/config.py @@ -3,61 +3,203 @@ import torch # for making bounding boxes pretty -COLORS = ((244, 67, 54), - (233, 30, 99), - (156, 39, 176), - (103, 58, 183), - ( 63, 81, 181), - ( 33, 150, 243), - ( 3, 169, 244), - ( 0, 188, 212), - ( 0, 150, 136), - ( 76, 175, 80), - (139, 195, 74), - (205, 220, 57), - (255, 235, 59), - (255, 193, 7), - (255, 152, 0), - (255, 87, 34), - (121, 85, 72), - (158, 158, 158), - ( 96, 125, 139)) +COLORS = ( + (244, 67, 54), + (233, 30, 99), + (156, 39, 176), + (103, 58, 183), + (63, 81, 181), + (33, 150, 243), + (3, 169, 244), + (0, 188, 212), + (0, 150, 136), + (76, 175, 80), + (139, 195, 74), + (205, 220, 57), + (255, 235, 59), + (255, 193, 7), + (255, 152, 0), + (255, 87, 34), + (121, 85, 72), + (158, 158, 158), + (96, 125, 139), +) # These are in BGR and are for ImageNet MEANS = (103.94, 116.78, 123.68) -STD = (57.38, 57.12, 58.40) - -COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', - 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', - 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', - 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', - 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', - 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', - 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', - 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', - 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', - 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', - 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', - 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', - 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', - 'scissors', 'teddy bear', 'hair drier', 'toothbrush') - -COCO_LABEL_MAP = { 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, - 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, - 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, - 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, - 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, - 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, - 54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, - 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, - 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, - 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} - +STD = (57.38, 57.12, 58.40) + +COCO_CLASSES = ( + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +) + +COCO_LABEL_MAP = { + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 13: 12, + 14: 13, + 15: 14, + 16: 15, + 17: 16, + 18: 17, + 19: 18, + 20: 19, + 21: 20, + 22: 21, + 23: 22, + 24: 23, + 25: 24, + 27: 25, + 28: 26, + 31: 27, + 32: 28, + 33: 29, + 34: 30, + 35: 31, + 36: 32, + 37: 33, + 38: 34, + 39: 35, + 40: 36, + 41: 37, + 42: 38, + 43: 39, + 44: 40, + 46: 41, + 47: 42, + 48: 43, + 49: 44, + 50: 45, + 51: 46, + 52: 47, + 53: 48, + 54: 49, + 55: 50, + 56: 51, + 57: 52, + 58: 53, + 59: 54, + 60: 55, + 61: 56, + 62: 57, + 63: 58, + 64: 59, + 65: 60, + 67: 61, + 70: 62, + 72: 63, + 73: 64, + 74: 65, + 75: 66, + 76: 67, + 77: 68, + 78: 69, + 79: 70, + 80: 71, + 81: 72, + 82: 73, + 84: 74, + 85: 75, + 86: 76, + 87: 77, + 88: 78, + 89: 79, + 90: 80, +} # ----------------------- CONFIG CLASS ----------------------- # + class Config(object): """ Holds the configuration for anything you want it to. @@ -78,7 +220,7 @@ def copy(self, new_config_dict={}): """ ret = Config(vars(self)) - + for key, val in new_config_dict.items(): ret.__setattr__(key, val) @@ -94,722 +236,725 @@ def replace(self, new_config_dict): for key, val in new_config_dict.items(): self.__setattr__(key, val) - + def print(self): for k, v in vars(self).items(): - print(k, ' = ', v) - - - + print(k, " = ", v) # ----------------------- DATASETS ----------------------- # -dataset_base = Config({ - 'name': 'Base Dataset', - - # Training images and annotations - 'train_images': './data/coco/images/', - 'train_info': 'path_to_annotation_file', - - # Validation images and annotations. - 'valid_images': './data/coco/images/', - 'valid_info': 'path_to_annotation_file', - - # Whether or not to load GT. If this is False, eval.py quantitative evaluation won't work. - 'has_gt': True, - - # A list of names for each of you classes. - 'class_names': COCO_CLASSES, - - # COCO class ids aren't sequential, so this is a bandage fix. If your ids aren't sequential, - # provide a map from category_id -> index in class_names + 1 (the +1 is there because it's 1-indexed). - # If not specified, this just assumes category ids start at 1 and increase sequentially. - 'label_map': None -}) - -coco2014_dataset = dataset_base.copy({ - 'name': 'COCO 2014', - - 'train_info': './data/coco/annotations/instances_train2014.json', - 'valid_info': './data/coco/annotations/instances_val2014.json', - - 'label_map': COCO_LABEL_MAP -}) - -coco2017_dataset = dataset_base.copy({ - 'name': 'COCO 2017', - - 'train_info': './data/coco/annotations/instances_train2017.json', - 'valid_info': './data/coco/annotations/instances_val2017.json', - - 'label_map': COCO_LABEL_MAP -}) - -coco2017_testdev_dataset = dataset_base.copy({ - 'name': 'COCO 2017 Test-Dev', - - 'valid_info': './data/coco/annotations/image_info_test-dev2017.json', - 'has_gt': False, - - 'label_map': COCO_LABEL_MAP -}) - -PASCAL_CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", - "dog", "horse", "motorbike", "person", "pottedplant", - "sheep", "sofa", "train", "tvmonitor") - -pascal_sbd_dataset = dataset_base.copy({ - 'name': 'Pascal SBD 2012', - - 'train_images': './data/sbd/img', - 'valid_images': './data/sbd/img', - - 'train_info': './data/sbd/pascal_sbd_train.json', - 'valid_info': './data/sbd/pascal_sbd_val.json', - - 'class_names': PASCAL_CLASSES, -}) - - - +dataset_base = Config( + { + "name": "Base Dataset", + # Training images and annotations + "train_images": "./data/coco/images/", + "train_info": "path_to_annotation_file", + # Validation images and annotations. + "valid_images": "./data/coco/images/", + "valid_info": "path_to_annotation_file", + # Whether or not to load GT. If this is False, eval.py quantitative evaluation won't work. + "has_gt": True, + # A list of names for each of you classes. + "class_names": COCO_CLASSES, + # COCO class ids aren't sequential, so this is a bandage fix. If your ids aren't sequential, + # provide a map from category_id -> index in class_names + 1 (the +1 is there because it's 1-indexed). + # If not specified, this just assumes category ids start at 1 and increase sequentially. + "label_map": None, + } +) + +coco2014_dataset = dataset_base.copy( + { + "name": "COCO 2014", + "train_info": "./data/coco/annotations/instances_train2014.json", + "valid_info": "./data/coco/annotations/instances_val2014.json", + "label_map": COCO_LABEL_MAP, + } +) + +coco2017_dataset = dataset_base.copy( + { + "name": "COCO 2017", + "train_info": "./data/coco/annotations/instances_train2017.json", + "valid_info": "./data/coco/annotations/instances_val2017.json", + "label_map": COCO_LABEL_MAP, + } +) + +coco2017_testdev_dataset = dataset_base.copy( + { + "name": "COCO 2017 Test-Dev", + "valid_info": "./data/coco/annotations/image_info_test-dev2017.json", + "has_gt": False, + "label_map": COCO_LABEL_MAP, + } +) + +PASCAL_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) + +pascal_sbd_dataset = dataset_base.copy( + { + "name": "Pascal SBD 2012", + "train_images": "./data/sbd/img", + "valid_images": "./data/sbd/img", + "train_info": "./data/sbd/pascal_sbd_train.json", + "valid_info": "./data/sbd/pascal_sbd_val.json", + "class_names": PASCAL_CLASSES, + } +) # ----------------------- TRANSFORMS ----------------------- # -resnet_transform = Config({ - 'channel_order': 'RGB', - 'normalize': True, - 'subtract_means': False, - 'to_float': False, -}) - -vgg_transform = Config({ - # Note that though vgg is traditionally BGR, - # the channel order of vgg_reducedfc.pth is RGB. - 'channel_order': 'RGB', - 'normalize': False, - 'subtract_means': True, - 'to_float': False, -}) - -darknet_transform = Config({ - 'channel_order': 'RGB', - 'normalize': False, - 'subtract_means': False, - 'to_float': True, -}) - - - +resnet_transform = Config( + { + "channel_order": "RGB", + "normalize": True, + "subtract_means": False, + "to_float": False, + } +) + +vgg_transform = Config( + { + # Note that though vgg is traditionally BGR, + # the channel order of vgg_reducedfc.pth is RGB. + "channel_order": "RGB", + "normalize": False, + "subtract_means": True, + "to_float": False, + } +) + +darknet_transform = Config( + { + "channel_order": "RGB", + "normalize": False, + "subtract_means": False, + "to_float": True, + } +) # ----------------------- BACKBONES ----------------------- # -backbone_base = Config({ - 'name': 'Base Backbone', - 'path': 'path/to/pretrained/weights', - 'type': object, - 'args': tuple(), - 'transform': resnet_transform, - - 'selected_layers': list(), - 'pred_scales': list(), - 'pred_aspect_ratios': list(), - - 'use_pixel_scales': False, - 'preapply_sqrt': True, - 'use_square_anchors': False, -}) - -resnet101_backbone = backbone_base.copy({ - 'name': 'ResNet101', - 'path': 'resnet101_reducedfc.pth', - 'type': ResNetBackbone, - 'args': ([3, 4, 23, 3],), - 'transform': resnet_transform, - - 'selected_layers': list(range(2, 8)), - 'pred_scales': [[1]]*6, - 'pred_aspect_ratios': [ [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] ] * 6, -}) - -resnet101_gn_backbone = backbone_base.copy({ - 'name': 'ResNet101_GN', - 'path': 'R-101-GN.pkl', - 'type': ResNetBackboneGN, - 'args': ([3, 4, 23, 3],), - 'transform': resnet_transform, - - 'selected_layers': list(range(2, 8)), - 'pred_scales': [[1]]*6, - 'pred_aspect_ratios': [ [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] ] * 6, -}) - -resnet101_dcn_inter3_backbone = resnet101_backbone.copy({ - 'name': 'ResNet101_DCN_Interval3', - 'args': ([3, 4, 23, 3], [0, 4, 23, 3], 3), -}) - -resnet50_backbone = resnet101_backbone.copy({ - 'name': 'ResNet50', - 'path': 'resnet50-19c8e357.pth', - 'type': ResNetBackbone, - 'args': ([3, 4, 6, 3],), - 'transform': resnet_transform, -}) - -resnet50_dcnv2_backbone = resnet50_backbone.copy({ - 'name': 'ResNet50_DCNv2', - 'args': ([3, 4, 6, 3], [0, 4, 6, 3]), -}) - -darknet53_backbone = backbone_base.copy({ - 'name': 'DarkNet53', - 'path': 'darknet53.pth', - 'type': DarkNetBackbone, - 'args': ([1, 2, 8, 8, 4],), - 'transform': darknet_transform, - - 'selected_layers': list(range(3, 9)), - 'pred_scales': [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [1.8, 1.92]], - 'pred_aspect_ratios': [ [[1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3] ], -}) - -vgg16_arch = [[64, 64], - [ 'M', 128, 128], - [ 'M', 256, 256, 256], - [('M', {'kernel_size': 2, 'stride': 2, 'ceil_mode': True}), 512, 512, 512], - [ 'M', 512, 512, 512], - [('M', {'kernel_size': 3, 'stride': 1, 'padding': 1}), - (1024, {'kernel_size': 3, 'padding': 6, 'dilation': 6}), - (1024, {'kernel_size': 1})]] - -vgg16_backbone = backbone_base.copy({ - 'name': 'VGG16', - 'path': 'vgg16_reducedfc.pth', - 'type': VGGBackbone, - 'args': (vgg16_arch, [(256, 2), (128, 2), (128, 1), (128, 1)], [3]), - 'transform': vgg_transform, - - 'selected_layers': [3] + list(range(5, 10)), - 'pred_scales': [[5, 4]]*6, - 'pred_aspect_ratios': [ [[1], [1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n]] for n in [3, 5, 5, 5, 3, 3] ], -}) - - - +backbone_base = Config( + { + "name": "Base Backbone", + "path": "path/to/pretrained/weights", + "type": object, + "args": tuple(), + "transform": resnet_transform, + "selected_layers": list(), + "pred_scales": list(), + "pred_aspect_ratios": list(), + "use_pixel_scales": False, + "preapply_sqrt": True, + "use_square_anchors": False, + } +) + +resnet101_backbone = backbone_base.copy( + { + "name": "ResNet101", + "path": "resnet101_reducedfc.pth", + "type": ResNetBackbone, + "args": ([3, 4, 23, 3],), + "transform": resnet_transform, + "selected_layers": list(range(2, 8)), + "pred_scales": [[1]] * 6, + "pred_aspect_ratios": [ + [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] + ] + * 6, + } +) + +resnet101_gn_backbone = backbone_base.copy( + { + "name": "ResNet101_GN", + "path": "R-101-GN.pkl", + "type": ResNetBackboneGN, + "args": ([3, 4, 23, 3],), + "transform": resnet_transform, + "selected_layers": list(range(2, 8)), + "pred_scales": [[1]] * 6, + "pred_aspect_ratios": [ + [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] + ] + * 6, + } +) + +resnet101_dcn_inter3_backbone = resnet101_backbone.copy( + {"name": "ResNet101_DCN_Interval3", "args": ([3, 4, 23, 3], [0, 4, 23, 3], 3),} +) + +resnet50_backbone = resnet101_backbone.copy( + { + "name": "ResNet50", + "path": "resnet50-19c8e357.pth", + "type": ResNetBackbone, + "args": ([3, 4, 6, 3],), + "transform": resnet_transform, + } +) + +resnet50_dcnv2_backbone = resnet50_backbone.copy( + {"name": "ResNet50_DCNv2", "args": ([3, 4, 6, 3], [0, 4, 6, 3]),} +) + +darknet53_backbone = backbone_base.copy( + { + "name": "DarkNet53", + "path": "darknet53.pth", + "type": DarkNetBackbone, + "args": ([1, 2, 8, 8, 4],), + "transform": darknet_transform, + "selected_layers": list(range(3, 9)), + "pred_scales": [ + [3.5, 4.95], + [3.6, 4.90], + [3.3, 4.02], + [2.7, 3.10], + [2.1, 2.37], + [1.8, 1.92], + ], + "pred_aspect_ratios": [ + [[1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n], [1]] + for n in [3, 5, 5, 5, 3, 3] + ], + } +) + +vgg16_arch = [ + [64, 64], + ["M", 128, 128], + ["M", 256, 256, 256], + [("M", {"kernel_size": 2, "stride": 2, "ceil_mode": True}), 512, 512, 512], + ["M", 512, 512, 512], + [ + ("M", {"kernel_size": 3, "stride": 1, "padding": 1}), + (1024, {"kernel_size": 3, "padding": 6, "dilation": 6}), + (1024, {"kernel_size": 1}), + ], +] + +vgg16_backbone = backbone_base.copy( + { + "name": "VGG16", + "path": "vgg16_reducedfc.pth", + "type": VGGBackbone, + "args": (vgg16_arch, [(256, 2), (128, 2), (128, 1), (128, 1)], [3]), + "transform": vgg_transform, + "selected_layers": [3] + list(range(5, 10)), + "pred_scales": [[5, 4]] * 6, + "pred_aspect_ratios": [ + [[1], [1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n]] + for n in [3, 5, 5, 5, 3, 3] + ], + } +) # ----------------------- MASK BRANCH TYPES ----------------------- # -mask_type = Config({ - # Direct produces masks directly as the output of each pred module. - # This is denoted as fc-mask in the paper. - # Parameters: mask_size, use_gt_bboxes - 'direct': 0, - - # Lincomb produces coefficients as the output of each pred module then uses those coefficients - # to linearly combine features from a prototype network to create image-sized masks. - # Parameters: - # - masks_to_train (int): Since we're producing (near) full image masks, it'd take too much - # vram to backprop on every single mask. Thus we select only a subset. - # - mask_proto_src (int): The input layer to the mask prototype generation network. This is an - # index in backbone.layers. Use to use the image itself instead. - # - mask_proto_net (list): A list of layers in the mask proto network with the last one - # being where the masks are taken from. Each conv layer is in - # the form (num_features, kernel_size, **kwdargs). An empty - # list means to use the source for prototype masks. If the - # kernel_size is negative, this creates a deconv layer instead. - # If the kernel_size is negative and the num_features is None, - # this creates a simple bilinear interpolation layer instead. - # - mask_proto_bias (bool): Whether to include an extra coefficient that corresponds to a proto - # mask of all ones. - # - mask_proto_prototype_activation (func): The activation to apply to each prototype mask. - # - mask_proto_mask_activation (func): After summing the prototype masks with the predicted - # coeffs, what activation to apply to the final mask. - # - mask_proto_coeff_activation (func): The activation to apply to the mask coefficients. - # - mask_proto_crop (bool): If True, crop the mask with the predicted bbox during training. - # - mask_proto_crop_expand (float): If cropping, the percent to expand the cropping bbox by - # in each direction. This is to make the model less reliant - # on perfect bbox predictions. - # - mask_proto_loss (str [l1|disj]): If not None, apply an l1 or disjunctive regularization - # loss directly to the prototype masks. - # - mask_proto_binarize_downsampled_gt (bool): Binarize GT after dowsnampling during training? - # - mask_proto_normalize_mask_loss_by_sqrt_area (bool): Whether to normalize mask loss by sqrt(sum(gt)) - # - mask_proto_reweight_mask_loss (bool): Reweight mask loss such that background is divided by - # #background and foreground is divided by #foreground. - # - mask_proto_grid_file (str): The path to the grid file to use with the next option. - # This should be a numpy.dump file with shape [numgrids, h, w] - # where h and w are w.r.t. the mask_proto_src convout. - # - mask_proto_use_grid (bool): Whether to add extra grid features to the proto_net input. - # - mask_proto_coeff_gate (bool): Add an extra set of sigmoided coefficients that is multiplied - # into the predicted coefficients in order to "gate" them. - # - mask_proto_prototypes_as_features (bool): For each prediction module, downsample the prototypes - # to the convout size of that module and supply the prototypes as input - # in addition to the already supplied backbone features. - # - mask_proto_prototypes_as_features_no_grad (bool): If the above is set, don't backprop gradients to - # to the prototypes from the network head. - # - mask_proto_remove_empty_masks (bool): Remove masks that are downsampled to 0 during loss calculations. - # - mask_proto_reweight_coeff (float): The coefficient to multiple the forground pixels with if reweighting. - # - mask_proto_coeff_diversity_loss (bool): Apply coefficient diversity loss on the coefficients so that the same - # instance has similar coefficients. - # - mask_proto_coeff_diversity_alpha (float): The weight to use for the coefficient diversity loss. - # - mask_proto_normalize_emulate_roi_pooling (bool): Normalize the mask loss to emulate roi pooling's affect on loss. - # - mask_proto_double_loss (bool): Whether to use the old loss in addition to any special new losses. - # - mask_proto_double_loss_alpha (float): The alpha to weight the above loss. - # - mask_proto_split_prototypes_by_head (bool): If true, this will give each prediction head its own prototypes. - # - mask_proto_crop_with_pred_box (bool): Whether to crop with the predicted box or the gt box. - 'lincomb': 1, -}) - - - +mask_type = Config( + { + # Direct produces masks directly as the output of each pred module. + # This is denoted as fc-mask in the paper. + # Parameters: mask_size, use_gt_bboxes + "direct": 0, + # Lincomb produces coefficients as the output of each pred module then uses those coefficients + # to linearly combine features from a prototype network to create image-sized masks. + # Parameters: + # - masks_to_train (int): Since we're producing (near) full image masks, it'd take too much + # vram to backprop on every single mask. Thus we select only a subset. + # - mask_proto_src (int): The input layer to the mask prototype generation network. This is an + # index in backbone.layers. Use to use the image itself instead. + # - mask_proto_net (list): A list of layers in the mask proto network with the last one + # being where the masks are taken from. Each conv layer is in + # the form (num_features, kernel_size, **kwdargs). An empty + # list means to use the source for prototype masks. If the + # kernel_size is negative, this creates a deconv layer instead. + # If the kernel_size is negative and the num_features is None, + # this creates a simple bilinear interpolation layer instead. + # - mask_proto_bias (bool): Whether to include an extra coefficient that corresponds to a proto + # mask of all ones. + # - mask_proto_prototype_activation (func): The activation to apply to each prototype mask. + # - mask_proto_mask_activation (func): After summing the prototype masks with the predicted + # coeffs, what activation to apply to the final mask. + # - mask_proto_coeff_activation (func): The activation to apply to the mask coefficients. + # - mask_proto_crop (bool): If True, crop the mask with the predicted bbox during training. + # - mask_proto_crop_expand (float): If cropping, the percent to expand the cropping bbox by + # in each direction. This is to make the model less reliant + # on perfect bbox predictions. + # - mask_proto_loss (str [l1|disj]): If not None, apply an l1 or disjunctive regularization + # loss directly to the prototype masks. + # - mask_proto_binarize_downsampled_gt (bool): Binarize GT after dowsnampling during training? + # - mask_proto_normalize_mask_loss_by_sqrt_area (bool): Whether to normalize mask loss by sqrt(sum(gt)) + # - mask_proto_reweight_mask_loss (bool): Reweight mask loss such that background is divided by + # #background and foreground is divided by #foreground. + # - mask_proto_grid_file (str): The path to the grid file to use with the next option. + # This should be a numpy.dump file with shape [numgrids, h, w] + # where h and w are w.r.t. the mask_proto_src convout. + # - mask_proto_use_grid (bool): Whether to add extra grid features to the proto_net input. + # - mask_proto_coeff_gate (bool): Add an extra set of sigmoided coefficients that is multiplied + # into the predicted coefficients in order to "gate" them. + # - mask_proto_prototypes_as_features (bool): For each prediction module, downsample the prototypes + # to the convout size of that module and supply the prototypes as input + # in addition to the already supplied backbone features. + # - mask_proto_prototypes_as_features_no_grad (bool): If the above is set, don't backprop gradients to + # to the prototypes from the network head. + # - mask_proto_remove_empty_masks (bool): Remove masks that are downsampled to 0 during loss calculations. + # - mask_proto_reweight_coeff (float): The coefficient to multiple the forground pixels with if reweighting. + # - mask_proto_coeff_diversity_loss (bool): Apply coefficient diversity loss on the coefficients so that the same + # instance has similar coefficients. + # - mask_proto_coeff_diversity_alpha (float): The weight to use for the coefficient diversity loss. + # - mask_proto_normalize_emulate_roi_pooling (bool): Normalize the mask loss to emulate roi pooling's affect on loss. + # - mask_proto_double_loss (bool): Whether to use the old loss in addition to any special new losses. + # - mask_proto_double_loss_alpha (float): The alpha to weight the above loss. + # - mask_proto_split_prototypes_by_head (bool): If true, this will give each prediction head its own prototypes. + # - mask_proto_crop_with_pred_box (bool): Whether to crop with the predicted box or the gt box. + "lincomb": 1, + } +) # ----------------------- ACTIVATION FUNCTIONS ----------------------- # -activation_func = Config({ - 'tanh': torch.tanh, - 'sigmoid': torch.sigmoid, - 'softmax': lambda x: torch.nn.functional.softmax(x, dim=-1), - 'relu': lambda x: torch.nn.functional.relu(x, inplace=True), - 'none': lambda x: x, -}) - - - +activation_func = Config( + { + "tanh": torch.tanh, + "sigmoid": torch.sigmoid, + "softmax": lambda x: torch.nn.functional.softmax(x, dim=-1), + "relu": lambda x: torch.nn.functional.relu(x, inplace=True), + "none": lambda x: x, + } +) # ----------------------- FPN DEFAULTS ----------------------- # -fpn_base = Config({ - # The number of features to have in each FPN layer - 'num_features': 256, - - # The upsampling mode used - 'interpolation_mode': 'bilinear', - - # The number of extra layers to be produced by downsampling starting at P5 - 'num_downsample': 1, - - # Whether to down sample with a 3x3 stride 2 conv layer instead of just a stride 2 selection - 'use_conv_downsample': False, - - # Whether to pad the pred layers with 1 on each side (I forgot to add this at the start) - # This is just here for backwards compatibility - 'pad': True, - - # Whether to add relu to the downsampled layers. - 'relu_downsample_layers': False, - - # Whether to add relu to the regular layers - 'relu_pred_layers': True, -}) - - - +fpn_base = Config( + { + # The number of features to have in each FPN layer + "num_features": 256, + # The upsampling mode used + "interpolation_mode": "bilinear", + # The number of extra layers to be produced by downsampling starting at P5 + "num_downsample": 1, + # Whether to down sample with a 3x3 stride 2 conv layer instead of just a stride 2 selection + "use_conv_downsample": False, + # Whether to pad the pred layers with 1 on each side (I forgot to add this at the start) + # This is just here for backwards compatibility + "pad": True, + # Whether to add relu to the downsampled layers. + "relu_downsample_layers": False, + # Whether to add relu to the regular layers + "relu_pred_layers": True, + } +) # ----------------------- CONFIG DEFAULTS ----------------------- # -coco_base_config = Config({ - 'dataset': coco2014_dataset, - 'num_classes': 81, # This should include the background class - - 'max_iter': 400000, - - # The maximum number of detections for evaluation - 'max_num_detections': 100, - - # dw' = momentum * dw - lr * (grad + decay * w) - 'lr': 1e-3, - 'momentum': 0.9, - 'decay': 5e-4, - - # For each lr step, what to multiply the lr with - 'gamma': 0.1, - 'lr_steps': (280000, 360000, 400000), - - # Initial learning rate to linearly warmup from (if until > 0) - 'lr_warmup_init': 1e-4, - - # If > 0 then increase the lr linearly from warmup_init to lr each iter for until iters - 'lr_warmup_until': 500, - - # The terms to scale the respective loss by - 'conf_alpha': 1, - 'bbox_alpha': 1.5, - 'mask_alpha': 0.4 / 256 * 140 * 140, # Some funky equation. Don't worry about it. - - # Eval.py sets this if you just want to run YOLACT as a detector - 'eval_mask_branch': True, - - # Top_k examples to consider for NMS - 'nms_top_k': 200, - # Examples with confidence less than this are not considered by NMS - 'nms_conf_thresh': 0.05, - # Boxes with IoU overlap greater than this threshold will be culled during NMS - 'nms_thresh': 0.5, - - # See mask_type for details. - 'mask_type': mask_type.direct, - 'mask_size': 16, - 'masks_to_train': 100, - 'mask_proto_src': None, - 'mask_proto_net': [(256, 3, {}), (256, 3, {})], - 'mask_proto_bias': False, - 'mask_proto_prototype_activation': activation_func.relu, - 'mask_proto_mask_activation': activation_func.sigmoid, - 'mask_proto_coeff_activation': activation_func.tanh, - 'mask_proto_crop': True, - 'mask_proto_crop_expand': 0, - 'mask_proto_loss': None, - 'mask_proto_binarize_downsampled_gt': True, - 'mask_proto_normalize_mask_loss_by_sqrt_area': False, - 'mask_proto_reweight_mask_loss': False, - 'mask_proto_grid_file': 'data/grid.npy', - 'mask_proto_use_grid': False, - 'mask_proto_coeff_gate': False, - 'mask_proto_prototypes_as_features': False, - 'mask_proto_prototypes_as_features_no_grad': False, - 'mask_proto_remove_empty_masks': False, - 'mask_proto_reweight_coeff': 1, - 'mask_proto_coeff_diversity_loss': False, - 'mask_proto_coeff_diversity_alpha': 1, - 'mask_proto_normalize_emulate_roi_pooling': False, - 'mask_proto_double_loss': False, - 'mask_proto_double_loss_alpha': 1, - 'mask_proto_split_prototypes_by_head': False, - 'mask_proto_crop_with_pred_box': False, - - # SSD data augmentation parameters - # Randomize hue, vibrance, etc. - 'augment_photometric_distort': True, - # Have a chance to scale down the image and pad (to emulate smaller detections) - 'augment_expand': True, - # Potentialy sample a random crop from the image and put it in a random place - 'augment_random_sample_crop': True, - # Mirror the image with a probability of 1/2 - 'augment_random_mirror': True, - # Flip the image vertically with a probability of 1/2 - 'augment_random_flip': False, - # With uniform probability, rotate the image [0,90,180,270] degrees - 'augment_random_rot90': False, - - # Discard detections with width and height smaller than this (in absolute width and height) - 'discard_box_width': 4 / 550, - 'discard_box_height': 4 / 550, - - # If using batchnorm anywhere in the backbone, freeze the batchnorm layer during training. - # Note: any additional batch norm layers after the backbone will not be frozen. - 'freeze_bn': False, - - # Set this to a config object if you want an FPN (inherit from fpn_base). See fpn_base for details. - 'fpn': None, - - # Use the same weights for each network head - 'share_prediction_module': False, - - # For hard negative mining, instead of using the negatives that are leastl confidently background, - # use negatives that are most confidently not background. - 'ohem_use_most_confident': False, - - # Use focal loss as described in https://arxiv.org/pdf/1708.02002.pdf instead of OHEM - 'use_focal_loss': False, - 'focal_loss_alpha': 0.25, - 'focal_loss_gamma': 2, - - # The initial bias toward forground objects, as specified in the focal loss paper - 'focal_loss_init_pi': 0.01, - - # Keeps track of the average number of examples for each class, and weights the loss for that class accordingly. - 'use_class_balanced_conf': False, - - # Whether to use sigmoid focal loss instead of softmax, all else being the same. - 'use_sigmoid_focal_loss': False, - - # Use class[0] to be the objectness score and class[1:] to be the softmax predicted class. - # Note: at the moment this is only implemented if use_focal_loss is on. - 'use_objectness_score': False, - - # Adds a global pool + fc layer to the smallest selected layer that predicts the existence of each of the 80 classes. - # This branch is only evaluated during training time and is just there for multitask learning. - 'use_class_existence_loss': False, - 'class_existence_alpha': 1, - - # Adds a 1x1 convolution directly to the biggest selected layer that predicts a semantic segmentations for each of the 80 classes. - # This branch is only evaluated during training time and is just there for multitask learning. - 'use_semantic_segmentation_loss': False, - 'semantic_segmentation_alpha': 1, - - # Adds another branch to the netwok to predict Mask IoU. - 'use_mask_scoring': False, - 'mask_scoring_alpha': 1, - - # Match gt boxes using the Box2Pix change metric instead of the standard IoU metric. - # Note that the threshold you set for iou_threshold should be negative with this setting on. - 'use_change_matching': False, - - # Uses the same network format as mask_proto_net, except this time it's for adding extra head layers before the final - # prediction in prediction modules. If this is none, no extra layers will be added. - 'extra_head_net': None, - - # What params should the final head layers have (the ones that predict box, confidence, and mask coeffs) - 'head_layer_params': {'kernel_size': 3, 'padding': 1}, - - # Add extra layers between the backbone and the network heads - # The order is (bbox, conf, mask) - 'extra_layers': (0, 0, 0), - - # During training, to match detections with gt, first compute the maximum gt IoU for each prior. - # Then, any of those priors whose maximum overlap is over the positive threshold, mark as positive. - # For any priors whose maximum is less than the negative iou threshold, mark them as negative. - # The rest are neutral and not used in calculating the loss. - 'positive_iou_threshold': 0.5, - 'negative_iou_threshold': 0.5, - - # When using ohem, the ratio between positives and negatives (3 means 3 negatives to 1 positive) - 'ohem_negpos_ratio': 3, - - # If less than 1, anchors treated as a negative that have a crowd iou over this threshold with - # the crowd boxes will be treated as a neutral. - 'crowd_iou_threshold': 1, - - # This is filled in at runtime by Yolact's __init__, so don't touch it - 'mask_dim': None, - - # Input image size. - 'max_size': 300, - - # Whether or not to do post processing on the cpu at test time - 'force_cpu_nms': True, - - # Whether to use mask coefficient cosine similarity nms instead of bbox iou nms - 'use_coeff_nms': False, - - # Whether or not to have a separate branch whose sole purpose is to act as the coefficients for coeff_diversity_loss - # Remember to turn on coeff_diversity_loss, or these extra coefficients won't do anything! - # To see their effect, also remember to turn on use_coeff_nms. - 'use_instance_coeff': False, - 'num_instance_coeffs': 64, - - # Whether or not to tie the mask loss / box loss to 0 - 'train_masks': True, - 'train_boxes': True, - # If enabled, the gt masks will be cropped using the gt bboxes instead of the predicted ones. - # This speeds up training time considerably but results in much worse mAP at test time. - 'use_gt_bboxes': False, - - # Whether or not to preserve aspect ratio when resizing the image. - # If True, this will resize all images to be max_size^2 pixels in area while keeping aspect ratio. - # If False, all images are resized to max_size x max_size - 'preserve_aspect_ratio': False, - - # Whether or not to use the prediction module (c) from DSSD - 'use_prediction_module': False, - - # Whether or not to use the predicted coordinate scheme from Yolo v2 - 'use_yolo_regressors': False, - - # For training, bboxes are considered "positive" if their anchors have a 0.5 IoU overlap - # or greater with a ground truth box. If this is true, instead of using the anchor boxes - # for this IoU computation, the matching function will use the predicted bbox coordinates. - # Don't turn this on if you're not using yolo regressors! - 'use_prediction_matching': False, - - # A list of settings to apply after the specified iteration. Each element of the list should look like - # (iteration, config_dict) where config_dict is a dictionary you'd pass into a config object's init. - 'delayed_settings': [], - - # Use command-line arguments to set this. - 'no_jit': False, - - 'backbone': None, - 'name': 'base_config', - - # Fast Mask Re-scoring Network - # Inspried by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241) - # Do not crop out the mask with bbox but slide a convnet on the image-size mask, - # then use global pooling to get the final mask score - 'use_maskiou': False, - - # Archecture for the mask iou network. A (num_classes-1, 1, {}) layer is appended to the end. - 'maskiou_net': [], - - # Discard predicted masks whose area is less than this - 'discard_mask_area': -1, - - 'maskiou_alpha': 1.0, - 'rescore_mask': False, - 'rescore_bbox': False, - 'maskious_to_train': -1, -}) - - - +coco_base_config = Config( + { + "dataset": coco2014_dataset, + "num_classes": 81, # This should include the background class + "max_iter": 400000, + # The maximum number of detections for evaluation + "max_num_detections": 100, + # dw' = momentum * dw - lr * (grad + decay * w) + "lr": 1e-3, + "momentum": 0.9, + "decay": 5e-4, + # For each lr step, what to multiply the lr with + "gamma": 0.1, + "lr_steps": (280000, 360000, 400000), + # Initial learning rate to linearly warmup from (if until > 0) + "lr_warmup_init": 1e-4, + # If > 0 then increase the lr linearly from warmup_init to lr each iter for until iters + "lr_warmup_until": 500, + # The terms to scale the respective loss by + "conf_alpha": 1, + "bbox_alpha": 1.5, + "mask_alpha": 0.4 + / 256 + * 140 + * 140, # Some funky equation. Don't worry about it. + # Eval.py sets this if you just want to run YOLACT as a detector + "eval_mask_branch": True, + # Top_k examples to consider for NMS + "nms_top_k": 200, + # Examples with confidence less than this are not considered by NMS + "nms_conf_thresh": 0.05, + # Boxes with IoU overlap greater than this threshold will be culled during NMS + "nms_thresh": 0.5, + # See mask_type for details. + "mask_type": mask_type.direct, + "mask_size": 16, + "masks_to_train": 100, + "mask_proto_src": None, + "mask_proto_net": [(256, 3, {}), (256, 3, {})], + "mask_proto_bias": False, + "mask_proto_prototype_activation": activation_func.relu, + "mask_proto_mask_activation": activation_func.sigmoid, + "mask_proto_coeff_activation": activation_func.tanh, + "mask_proto_crop": True, + "mask_proto_crop_expand": 0, + "mask_proto_loss": None, + "mask_proto_binarize_downsampled_gt": True, + "mask_proto_normalize_mask_loss_by_sqrt_area": False, + "mask_proto_reweight_mask_loss": False, + "mask_proto_grid_file": "data/grid.npy", + "mask_proto_use_grid": False, + "mask_proto_coeff_gate": False, + "mask_proto_prototypes_as_features": False, + "mask_proto_prototypes_as_features_no_grad": False, + "mask_proto_remove_empty_masks": False, + "mask_proto_reweight_coeff": 1, + "mask_proto_coeff_diversity_loss": False, + "mask_proto_coeff_diversity_alpha": 1, + "mask_proto_normalize_emulate_roi_pooling": False, + "mask_proto_double_loss": False, + "mask_proto_double_loss_alpha": 1, + "mask_proto_split_prototypes_by_head": False, + "mask_proto_crop_with_pred_box": False, + # SSD data augmentation parameters + # Randomize hue, vibrance, etc. + "augment_photometric_distort": True, + # Have a chance to scale down the image and pad (to emulate smaller detections) + "augment_expand": True, + # Potentialy sample a random crop from the image and put it in a random place + "augment_random_sample_crop": True, + # Mirror the image with a probability of 1/2 + "augment_random_mirror": True, + # Flip the image vertically with a probability of 1/2 + "augment_random_flip": False, + # With uniform probability, rotate the image [0,90,180,270] degrees + "augment_random_rot90": False, + # Discard detections with width and height smaller than this (in absolute width and height) + "discard_box_width": 4 / 550, + "discard_box_height": 4 / 550, + # If using batchnorm anywhere in the backbone, freeze the batchnorm layer during training. + # Note: any additional batch norm layers after the backbone will not be frozen. + "freeze_bn": False, + # Set this to a config object if you want an FPN (inherit from fpn_base). See fpn_base for details. + "fpn": None, + # Use the same weights for each network head + "share_prediction_module": False, + # For hard negative mining, instead of using the negatives that are leastl confidently background, + # use negatives that are most confidently not background. + "ohem_use_most_confident": False, + # Use focal loss as described in https://arxiv.org/pdf/1708.02002.pdf instead of OHEM + "use_focal_loss": False, + "focal_loss_alpha": 0.25, + "focal_loss_gamma": 2, + # The initial bias toward forground objects, as specified in the focal loss paper + "focal_loss_init_pi": 0.01, + # Keeps track of the average number of examples for each class, and weights the loss for that class accordingly. + "use_class_balanced_conf": False, + # Whether to use sigmoid focal loss instead of softmax, all else being the same. + "use_sigmoid_focal_loss": False, + # Use class[0] to be the objectness score and class[1:] to be the softmax predicted class. + # Note: at the moment this is only implemented if use_focal_loss is on. + "use_objectness_score": False, + # Adds a global pool + fc layer to the smallest selected layer that predicts the existence of each of the 80 classes. + # This branch is only evaluated during training time and is just there for multitask learning. + "use_class_existence_loss": False, + "class_existence_alpha": 1, + # Adds a 1x1 convolution directly to the biggest selected layer that predicts a semantic segmentations for each of the 80 classes. + # This branch is only evaluated during training time and is just there for multitask learning. + "use_semantic_segmentation_loss": False, + "semantic_segmentation_alpha": 1, + # Adds another branch to the netwok to predict Mask IoU. + "use_mask_scoring": False, + "mask_scoring_alpha": 1, + # Match gt boxes using the Box2Pix change metric instead of the standard IoU metric. + # Note that the threshold you set for iou_threshold should be negative with this setting on. + "use_change_matching": False, + # Uses the same network format as mask_proto_net, except this time it's for adding extra head layers before the final + # prediction in prediction modules. If this is none, no extra layers will be added. + "extra_head_net": None, + # What params should the final head layers have (the ones that predict box, confidence, and mask coeffs) + "head_layer_params": {"kernel_size": 3, "padding": 1}, + # Add extra layers between the backbone and the network heads + # The order is (bbox, conf, mask) + "extra_layers": (0, 0, 0), + # During training, to match detections with gt, first compute the maximum gt IoU for each prior. + # Then, any of those priors whose maximum overlap is over the positive threshold, mark as positive. + # For any priors whose maximum is less than the negative iou threshold, mark them as negative. + # The rest are neutral and not used in calculating the loss. + "positive_iou_threshold": 0.5, + "negative_iou_threshold": 0.5, + # When using ohem, the ratio between positives and negatives (3 means 3 negatives to 1 positive) + "ohem_negpos_ratio": 3, + # If less than 1, anchors treated as a negative that have a crowd iou over this threshold with + # the crowd boxes will be treated as a neutral. + "crowd_iou_threshold": 1, + # This is filled in at runtime by Yolact's __init__, so don't touch it + "mask_dim": None, + # Input image size. + "max_size": 300, + # Whether or not to do post processing on the cpu at test time + "force_cpu_nms": True, + # Whether to use mask coefficient cosine similarity nms instead of bbox iou nms + "use_coeff_nms": False, + # Whether or not to have a separate branch whose sole purpose is to act as the coefficients for coeff_diversity_loss + # Remember to turn on coeff_diversity_loss, or these extra coefficients won't do anything! + # To see their effect, also remember to turn on use_coeff_nms. + "use_instance_coeff": False, + "num_instance_coeffs": 64, + # Whether or not to tie the mask loss / box loss to 0 + "train_masks": True, + "train_boxes": True, + # If enabled, the gt masks will be cropped using the gt bboxes instead of the predicted ones. + # This speeds up training time considerably but results in much worse mAP at test time. + "use_gt_bboxes": False, + # Whether or not to preserve aspect ratio when resizing the image. + # If True, this will resize all images to be max_size^2 pixels in area while keeping aspect ratio. + # If False, all images are resized to max_size x max_size + "preserve_aspect_ratio": False, + # Whether or not to use the prediction module (c) from DSSD + "use_prediction_module": False, + # Whether or not to use the predicted coordinate scheme from Yolo v2 + "use_yolo_regressors": False, + # For training, bboxes are considered "positive" if their anchors have a 0.5 IoU overlap + # or greater with a ground truth box. If this is true, instead of using the anchor boxes + # for this IoU computation, the matching function will use the predicted bbox coordinates. + # Don't turn this on if you're not using yolo regressors! + "use_prediction_matching": False, + # A list of settings to apply after the specified iteration. Each element of the list should look like + # (iteration, config_dict) where config_dict is a dictionary you'd pass into a config object's init. + "delayed_settings": [], + # Use command-line arguments to set this. + "no_jit": False, + "backbone": None, + "name": "base_config", + # Fast Mask Re-scoring Network + # Inspried by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241) + # Do not crop out the mask with bbox but slide a convnet on the image-size mask, + # then use global pooling to get the final mask score + "use_maskiou": False, + # Archecture for the mask iou network. A (num_classes-1, 1, {}) layer is appended to the end. + "maskiou_net": [], + # Discard predicted masks whose area is less than this + "discard_mask_area": -1, + "maskiou_alpha": 1.0, + "rescore_mask": False, + "rescore_bbox": False, + "maskious_to_train": -1, + } +) # ----------------------- YOLACT v1.0 CONFIGS ----------------------- # -yolact_base_config = coco_base_config.copy({ - 'name': 'yolact_base', - - # Dataset stuff - 'dataset': coco2017_dataset, - 'num_classes': len(coco2017_dataset.class_names) + 1, - - # Image Size - 'max_size': 550, - - # Training params - 'lr_steps': (280000, 600000, 700000, 750000), - 'max_iter': 800000, - - # Backbone Settings - 'backbone': resnet101_backbone.copy({ - 'selected_layers': list(range(1, 4)), - 'use_pixel_scales': True, - 'preapply_sqrt': False, - 'use_square_anchors': True, # This is for backward compatability with a bug - - 'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5, - 'pred_scales': [[24], [48], [96], [192], [384]], - }), - - # FPN Settings - 'fpn': fpn_base.copy({ - 'use_conv_downsample': True, - 'num_downsample': 2, - }), - - # Mask Settings - 'mask_type': mask_type.lincomb, - 'mask_alpha': 6.125, - 'mask_proto_src': 0, - 'mask_proto_net': [(256, 3, {'padding': 1})] * 3 + [(None, -2, {}), (256, 3, {'padding': 1})] + [(32, 1, {})], - 'mask_proto_normalize_emulate_roi_pooling': True, - - # Other stuff - 'share_prediction_module': True, - 'extra_head_net': [(256, 3, {'padding': 1})], - - 'positive_iou_threshold': 0.5, - 'negative_iou_threshold': 0.4, - - 'crowd_iou_threshold': 0.7, - - 'use_semantic_segmentation_loss': True, -}) - -yolact_im400_config = yolact_base_config.copy({ - 'name': 'yolact_im400', - - 'max_size': 400, - 'backbone': yolact_base_config.backbone.copy({ - 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 400)] for x in yolact_base_config.backbone.pred_scales], - }), -}) - -yolact_im700_config = yolact_base_config.copy({ - 'name': 'yolact_im700', - - 'masks_to_train': 300, - 'max_size': 700, - 'backbone': yolact_base_config.backbone.copy({ - 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 700)] for x in yolact_base_config.backbone.pred_scales], - }), -}) - -yolact_darknet53_config = yolact_base_config.copy({ - 'name': 'yolact_darknet53', - - 'backbone': darknet53_backbone.copy({ - 'selected_layers': list(range(2, 5)), - - 'pred_scales': yolact_base_config.backbone.pred_scales, - 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, - 'use_pixel_scales': True, - 'preapply_sqrt': False, - 'use_square_anchors': True, # This is for backward compatability with a bug - }), -}) - -yolact_resnet50_config = yolact_base_config.copy({ - 'name': 'yolact_resnet50', - - 'backbone': resnet50_backbone.copy({ - 'selected_layers': list(range(1, 4)), - - 'pred_scales': yolact_base_config.backbone.pred_scales, - 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, - 'use_pixel_scales': True, - 'preapply_sqrt': False, - 'use_square_anchors': True, # This is for backward compatability with a bug - }), -}) - - -yolact_resnet50_pascal_config = yolact_resnet50_config.copy({ - 'name': None, # Will default to yolact_resnet50_pascal - - # Dataset stuff - 'dataset': pascal_sbd_dataset, - 'num_classes': len(pascal_sbd_dataset.class_names) + 1, - - 'max_iter': 120000, - 'lr_steps': (60000, 100000), - - 'backbone': yolact_resnet50_config.backbone.copy({ - 'pred_scales': [[32], [64], [128], [256], [512]], - 'use_square_anchors': False, - }) -}) +yolact_base_config = coco_base_config.copy( + { + "name": "yolact_base", + # Dataset stuff + "dataset": coco2017_dataset, + "num_classes": len(coco2017_dataset.class_names) + 1, + # Image Size + "max_size": 550, + # Training params + "lr_steps": (280000, 600000, 700000, 750000), + "max_iter": 800000, + # Backbone Settings + "backbone": resnet101_backbone.copy( + { + "selected_layers": list(range(1, 4)), + "use_pixel_scales": True, + "preapply_sqrt": False, + "use_square_anchors": True, # This is for backward compatability with a bug + "pred_aspect_ratios": [[[1, 1 / 2, 2]]] * 5, + "pred_scales": [[24], [48], [96], [192], [384]], + } + ), + # FPN Settings + "fpn": fpn_base.copy({"use_conv_downsample": True, "num_downsample": 2,}), + # Mask Settings + "mask_type": mask_type.lincomb, + "mask_alpha": 6.125, + "mask_proto_src": 0, + "mask_proto_net": [(256, 3, {"padding": 1})] * 3 + + [(None, -2, {}), (256, 3, {"padding": 1})] + + [(32, 1, {})], + "mask_proto_normalize_emulate_roi_pooling": True, + # Other stuff + "share_prediction_module": True, + "extra_head_net": [(256, 3, {"padding": 1})], + "positive_iou_threshold": 0.5, + "negative_iou_threshold": 0.4, + "crowd_iou_threshold": 0.7, + "use_semantic_segmentation_loss": True, + } +) + +yolact_im400_config = yolact_base_config.copy( + { + "name": "yolact_im400", + "max_size": 400, + "backbone": yolact_base_config.backbone.copy( + { + "pred_scales": [ + [int(x[0] / yolact_base_config.max_size * 400)] + for x in yolact_base_config.backbone.pred_scales + ], + } + ), + } +) + +yolact_im700_config = yolact_base_config.copy( + { + "name": "yolact_im700", + "masks_to_train": 300, + "max_size": 700, + "backbone": yolact_base_config.backbone.copy( + { + "pred_scales": [ + [int(x[0] / yolact_base_config.max_size * 700)] + for x in yolact_base_config.backbone.pred_scales + ], + } + ), + } +) + +yolact_darknet53_config = yolact_base_config.copy( + { + "name": "yolact_darknet53", + "backbone": darknet53_backbone.copy( + { + "selected_layers": list(range(2, 5)), + "pred_scales": yolact_base_config.backbone.pred_scales, + "pred_aspect_ratios": yolact_base_config.backbone.pred_aspect_ratios, + "use_pixel_scales": True, + "preapply_sqrt": False, + "use_square_anchors": True, # This is for backward compatability with a bug + } + ), + } +) + +yolact_resnet50_config = yolact_base_config.copy( + { + "name": "yolact_resnet50", + "backbone": resnet50_backbone.copy( + { + "selected_layers": list(range(1, 4)), + "pred_scales": yolact_base_config.backbone.pred_scales, + "pred_aspect_ratios": yolact_base_config.backbone.pred_aspect_ratios, + "use_pixel_scales": True, + "preapply_sqrt": False, + "use_square_anchors": True, # This is for backward compatability with a bug + } + ), + } +) + + +yolact_resnet50_pascal_config = yolact_resnet50_config.copy( + { + "name": None, # Will default to yolact_resnet50_pascal + # Dataset stuff + "dataset": pascal_sbd_dataset, + "num_classes": len(pascal_sbd_dataset.class_names) + 1, + "max_iter": 120000, + "lr_steps": (60000, 100000), + "backbone": yolact_resnet50_config.backbone.copy( + { + "pred_scales": [[32], [64], [128], [256], [512]], + "use_square_anchors": False, + } + ), + } +) # ----------------------- YOLACT++ CONFIGS ----------------------- # -yolact_plus_base_config = yolact_base_config.copy({ - 'name': 'yolact_plus_base', - - 'backbone': resnet101_dcn_inter3_backbone.copy({ - 'selected_layers': list(range(1, 4)), - - 'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5, - 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], - 'use_pixel_scales': True, - 'preapply_sqrt': False, - 'use_square_anchors': False, - }), - - 'use_maskiou': True, - 'maskiou_net': [(8, 3, {'stride': 2}), (16, 3, {'stride': 2}), (32, 3, {'stride': 2}), (64, 3, {'stride': 2}), (128, 3, {'stride': 2})], - 'maskiou_alpha': 25, - 'rescore_bbox': False, - 'rescore_mask': True, - - 'discard_mask_area': 5*5, -}) - -yolact_plus_resnet50_config = yolact_plus_base_config.copy({ - 'name': 'yolact_plus_resnet50', - - 'backbone': resnet50_dcnv2_backbone.copy({ - 'selected_layers': list(range(1, 4)), - - 'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5, - 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], - 'use_pixel_scales': True, - 'preapply_sqrt': False, - 'use_square_anchors': False, - }), -}) +yolact_plus_base_config = yolact_base_config.copy( + { + "name": "yolact_plus_base", + "backbone": resnet101_dcn_inter3_backbone.copy( + { + "selected_layers": list(range(1, 4)), + "pred_aspect_ratios": [[[1, 1 / 2, 2]]] * 5, + "pred_scales": [ + [i * 2 ** (j / 3.0) for j in range(3)] + for i in [24, 48, 96, 192, 384] + ], + "use_pixel_scales": True, + "preapply_sqrt": False, + "use_square_anchors": False, + } + ), + "use_maskiou": True, + "maskiou_net": [ + (8, 3, {"stride": 2}), + (16, 3, {"stride": 2}), + (32, 3, {"stride": 2}), + (64, 3, {"stride": 2}), + (128, 3, {"stride": 2}), + ], + "maskiou_alpha": 25, + "rescore_bbox": False, + "rescore_mask": True, + "discard_mask_area": 5 * 5, + } +) + +yolact_plus_resnet50_config = yolact_plus_base_config.copy( + { + "name": "yolact_plus_resnet50", + "backbone": resnet50_dcnv2_backbone.copy( + { + "selected_layers": list(range(1, 4)), + "pred_aspect_ratios": [[[1, 1 / 2, 2]]] * 5, + "pred_scales": [ + [i * 2 ** (j / 3.0) for j in range(3)] + for i in [24, 48, 96, 192, 384] + ], + "use_pixel_scales": True, + "preapply_sqrt": False, + "use_square_anchors": False, + } + ), + } +) # Default config cfg = yolact_base_config.copy() -def set_cfg(config_name:str): + +def set_cfg(config_name: str): """ Sets the active config. Works even if cfg is already imported! """ global cfg @@ -818,9 +963,9 @@ def set_cfg(config_name:str): cfg.replace(eval(config_name)) if cfg.name is None: - cfg.name = config_name.split('_config')[0] + cfg.name = config_name.split("_config")[0] + -def set_dataset(dataset_name:str): +def set_dataset(dataset_name: str): """ Sets the dataset of the current config. """ cfg.dataset = eval(dataset_name) - diff --git a/data/scripts/mix_sets.py b/data/scripts/mix_sets.py index 5f70eacb5..646d5678f 100644 --- a/data/scripts/mix_sets.py +++ b/data/scripts/mix_sets.py @@ -24,51 +24,51 @@ This will take the last 5k images from val2014 and put it in instances_minival5k.json. """ -annotations_path = 'data/coco/annotations/instances_%s.json' -fields_to_combine = ('images', 'annotations') -fields_to_steal = ('info', 'categories', 'licenses') - -if __name__ == '__main__': - if len(sys.argv) < 4 or len(sys.argv) % 2 != 0: - print(usage_text) - exit() - - out_name = sys.argv[1] - sets = sys.argv[2:] - sets = [(sets[2*i], sets[2*i+1]) for i in range(len(sets)//2)] - - out = {x: [] for x in fields_to_combine} - - for idx, (set_name, range_str) in enumerate(sets): - print('Loading set %s...' % set_name) - with open(annotations_path % set_name, 'r') as f: - set_json = json.load(f) - - # "Steal" some fields that don't need to be combined from the first set - if idx == 0: - for field in fields_to_steal: - out[field] = set_json[field] - - print('Building image index...') - image_idx = {x['id']: x for x in set_json['images']} - - print('Collecting annotations...') - anns_idx = defaultdict(lambda: []) - - for ann in set_json['annotations']: - anns_idx[ann['image_id']].append(ann) - - export_ids = list(image_idx.keys()) - export_ids.sort() - export_ids = eval('export_ids[%s]' % range_str, {}, {'export_ids': export_ids}) - - print('Adding %d images...' % len(export_ids)) - for _id in export_ids: - out['images'].append(image_idx[_id]) - out['annotations'] += anns_idx[_id] - - print('Done.\n') - - print('Saving result...') - with open(annotations_path % (out_name), 'w') as out_file: - json.dump(out, out_file) +annotations_path = "data/coco/annotations/instances_%s.json" +fields_to_combine = ("images", "annotations") +fields_to_steal = ("info", "categories", "licenses") + +if __name__ == "__main__": + if len(sys.argv) < 4 or len(sys.argv) % 2 != 0: + print(usage_text) + exit() + + out_name = sys.argv[1] + sets = sys.argv[2:] + sets = [(sets[2 * i], sets[2 * i + 1]) for i in range(len(sets) // 2)] + + out = {x: [] for x in fields_to_combine} + + for idx, (set_name, range_str) in enumerate(sets): + print("Loading set %s..." % set_name) + with open(annotations_path % set_name, "r") as f: + set_json = json.load(f) + + # "Steal" some fields that don't need to be combined from the first set + if idx == 0: + for field in fields_to_steal: + out[field] = set_json[field] + + print("Building image index...") + image_idx = {x["id"]: x for x in set_json["images"]} + + print("Collecting annotations...") + anns_idx = defaultdict(lambda: []) + + for ann in set_json["annotations"]: + anns_idx[ann["image_id"]].append(ann) + + export_ids = list(image_idx.keys()) + export_ids.sort() + export_ids = eval("export_ids[%s]" % range_str, {}, {"export_ids": export_ids}) + + print("Adding %d images..." % len(export_ids)) + for _id in export_ids: + out["images"].append(image_idx[_id]) + out["annotations"] += anns_idx[_id] + + print("Done.\n") + + print("Saving result...") + with open(annotations_path % (out_name), "w") as out_file: + json.dump(out, out_file) diff --git a/eval.py b/eval.py index 547bc0aae..84a12788a 100644 --- a/eval.py +++ b/eval.py @@ -29,110 +29,274 @@ import matplotlib.pyplot as plt import cv2 + def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): + if v.lower() in ("yes", "true", "t", "y", "1"): return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): + elif v.lower() in ("no", "false", "f", "n", "0"): return False else: - raise argparse.ArgumentTypeError('Boolean value expected.') + raise argparse.ArgumentTypeError("Boolean value expected.") + def parse_args(argv=None): - parser = argparse.ArgumentParser( - description='YOLACT COCO Evaluation') - parser.add_argument('--trained_model', - default='weights/ssd300_mAP_77.43_v2.pth', type=str, - help='Trained state_dict file path to open. If "interrupt", this will open the interrupt file.') - parser.add_argument('--top_k', default=5, type=int, - help='Further restrict the number of predictions to parse') - parser.add_argument('--cuda', default=True, type=str2bool, - help='Use cuda to evaulate model') - parser.add_argument('--fast_nms', default=True, type=str2bool, - help='Whether to use a faster, but not entirely correct version of NMS.') - parser.add_argument('--cross_class_nms', default=False, type=str2bool, - help='Whether compute NMS cross-class or per-class.') - parser.add_argument('--display_masks', default=True, type=str2bool, - help='Whether or not to display masks over bounding boxes') - parser.add_argument('--display_bboxes', default=True, type=str2bool, - help='Whether or not to display bboxes around masks') - parser.add_argument('--display_text', default=True, type=str2bool, - help='Whether or not to display text (class [score])') - parser.add_argument('--display_scores', default=True, type=str2bool, - help='Whether or not to display scores in addition to classes') - parser.add_argument('--display', dest='display', action='store_true', - help='Display qualitative results instead of quantitative ones.') - parser.add_argument('--shuffle', dest='shuffle', action='store_true', - help='Shuffles the images when displaying them. Doesn\'t have much of an effect when display is off though.') - parser.add_argument('--ap_data_file', default='results/ap_data.pkl', type=str, - help='In quantitative mode, the file to save detections before calculating mAP.') - parser.add_argument('--resume', dest='resume', action='store_true', - help='If display not set, this resumes mAP calculations from the ap_data_file.') - parser.add_argument('--max_images', default=-1, type=int, - help='The maximum number of images from the dataset to consider. Use -1 for all.') - parser.add_argument('--output_coco_json', dest='output_coco_json', action='store_true', - help='If display is not set, instead of processing IoU values, this just dumps detections into the coco json file.') - parser.add_argument('--bbox_det_file', default='results/bbox_detections.json', type=str, - help='The output file for coco bbox results if --coco_results is set.') - parser.add_argument('--mask_det_file', default='results/mask_detections.json', type=str, - help='The output file for coco mask results if --coco_results is set.') - parser.add_argument('--config', default=None, - help='The config object to use.') - parser.add_argument('--output_web_json', dest='output_web_json', action='store_true', - help='If display is not set, instead of processing IoU values, this dumps detections for usage with the detections viewer web thingy.') - parser.add_argument('--web_det_path', default='web/dets/', type=str, - help='If output_web_json is set, this is the path to dump detections into.') - parser.add_argument('--no_bar', dest='no_bar', action='store_true', - help='Do not output the status bar. This is useful for when piping to a file.') - parser.add_argument('--display_lincomb', default=False, type=str2bool, - help='If the config uses lincomb masks, output a visualization of how those masks are created.') - parser.add_argument('--benchmark', default=False, dest='benchmark', action='store_true', - help='Equivalent to running display mode but without displaying an image.') - parser.add_argument('--no_sort', default=False, dest='no_sort', action='store_true', - help='Do not sort images by hashed image ID.') - parser.add_argument('--seed', default=None, type=int, - help='The seed to pass into random.seed. Note: this is only really for the shuffle and does not (I think) affect cuda stuff.') - parser.add_argument('--mask_proto_debug', default=False, dest='mask_proto_debug', action='store_true', - help='Outputs stuff for scripts/compute_mask.py.') - parser.add_argument('--no_crop', default=False, dest='crop', action='store_false', - help='Do not crop output masks with the predicted bounding box.') - parser.add_argument('--image', default=None, type=str, - help='A path to an image to use for display.') - parser.add_argument('--images', default=None, type=str, - help='An input folder of images and output folder to save detected images. Should be in the format input->output.') - parser.add_argument('--video', default=None, type=str, - help='A path to a video to evaluate on. Passing in a number will use that index webcam.') - parser.add_argument('--video_multiframe', default=1, type=int, - help='The number of frames to evaluate in parallel to make videos play at higher fps.') - parser.add_argument('--score_threshold', default=0, type=float, - help='Detections with a score under this threshold will not be considered. This currently only works in display mode.') - parser.add_argument('--dataset', default=None, type=str, - help='If specified, override the dataset specified in the config with this one (example: coco2017_dataset).') - parser.add_argument('--detect', default=False, dest='detect', action='store_true', - help='Don\'t evauluate the mask branch at all and only do object detection. This only works for --display and --benchmark.') - parser.add_argument('--display_fps', default=False, dest='display_fps', action='store_true', - help='When displaying / saving video, draw the FPS on the frame') - parser.add_argument('--emulate_playback', default=False, dest='emulate_playback', action='store_true', - help='When saving a video, emulate the framerate that you\'d get running in real-time mode.') - - parser.set_defaults(no_bar=False, display=False, resume=False, output_coco_json=False, output_web_json=False, shuffle=False, - benchmark=False, no_sort=False, no_hash=False, mask_proto_debug=False, crop=True, detect=False, display_fps=False, - emulate_playback=False) + parser = argparse.ArgumentParser(description="YOLACT COCO Evaluation") + parser.add_argument( + "--trained_model", + default="weights/ssd300_mAP_77.43_v2.pth", + type=str, + help='Trained state_dict file path to open. If "interrupt", this will open the interrupt file.', + ) + parser.add_argument( + "--top_k", + default=5, + type=int, + help="Further restrict the number of predictions to parse", + ) + parser.add_argument( + "--cuda", default=True, type=str2bool, help="Use cuda to evaulate model" + ) + parser.add_argument( + "--fast_nms", + default=True, + type=str2bool, + help="Whether to use a faster, but not entirely correct version of NMS.", + ) + parser.add_argument( + "--cross_class_nms", + default=False, + type=str2bool, + help="Whether compute NMS cross-class or per-class.", + ) + parser.add_argument( + "--display_masks", + default=True, + type=str2bool, + help="Whether or not to display masks over bounding boxes", + ) + parser.add_argument( + "--display_bboxes", + default=True, + type=str2bool, + help="Whether or not to display bboxes around masks", + ) + parser.add_argument( + "--display_text", + default=True, + type=str2bool, + help="Whether or not to display text (class [score])", + ) + parser.add_argument( + "--display_scores", + default=True, + type=str2bool, + help="Whether or not to display scores in addition to classes", + ) + parser.add_argument( + "--display", + dest="display", + action="store_true", + help="Display qualitative results instead of quantitative ones.", + ) + parser.add_argument( + "--shuffle", + dest="shuffle", + action="store_true", + help="Shuffles the images when displaying them. Doesn't have much of an effect when display is off though.", + ) + parser.add_argument( + "--ap_data_file", + default="results/ap_data.pkl", + type=str, + help="In quantitative mode, the file to save detections before calculating mAP.", + ) + parser.add_argument( + "--resume", + dest="resume", + action="store_true", + help="If display not set, this resumes mAP calculations from the ap_data_file.", + ) + parser.add_argument( + "--max_images", + default=-1, + type=int, + help="The maximum number of images from the dataset to consider. Use -1 for all.", + ) + parser.add_argument( + "--output_coco_json", + dest="output_coco_json", + action="store_true", + help="If display is not set, instead of processing IoU values, this just dumps detections into the coco json file.", + ) + parser.add_argument( + "--bbox_det_file", + default="results/bbox_detections.json", + type=str, + help="The output file for coco bbox results if --coco_results is set.", + ) + parser.add_argument( + "--mask_det_file", + default="results/mask_detections.json", + type=str, + help="The output file for coco mask results if --coco_results is set.", + ) + parser.add_argument("--config", default=None, help="The config object to use.") + parser.add_argument( + "--output_web_json", + dest="output_web_json", + action="store_true", + help="If display is not set, instead of processing IoU values, this dumps detections for usage with the detections viewer web thingy.", + ) + parser.add_argument( + "--web_det_path", + default="web/dets/", + type=str, + help="If output_web_json is set, this is the path to dump detections into.", + ) + parser.add_argument( + "--no_bar", + dest="no_bar", + action="store_true", + help="Do not output the status bar. This is useful for when piping to a file.", + ) + parser.add_argument( + "--display_lincomb", + default=False, + type=str2bool, + help="If the config uses lincomb masks, output a visualization of how those masks are created.", + ) + parser.add_argument( + "--benchmark", + default=False, + dest="benchmark", + action="store_true", + help="Equivalent to running display mode but without displaying an image.", + ) + parser.add_argument( + "--no_sort", + default=False, + dest="no_sort", + action="store_true", + help="Do not sort images by hashed image ID.", + ) + parser.add_argument( + "--seed", + default=None, + type=int, + help="The seed to pass into random.seed. Note: this is only really for the shuffle and does not (I think) affect cuda stuff.", + ) + parser.add_argument( + "--mask_proto_debug", + default=False, + dest="mask_proto_debug", + action="store_true", + help="Outputs stuff for scripts/compute_mask.py.", + ) + parser.add_argument( + "--no_crop", + default=False, + dest="crop", + action="store_false", + help="Do not crop output masks with the predicted bounding box.", + ) + parser.add_argument( + "--image", default=None, type=str, help="A path to an image to use for display." + ) + parser.add_argument( + "--images", + default=None, + type=str, + help="An input folder of images and output folder to save detected images. Should be in the format input->output.", + ) + parser.add_argument( + "--video", + default=None, + type=str, + help="A path to a video to evaluate on. Passing in a number will use that index webcam.", + ) + parser.add_argument( + "--video_multiframe", + default=1, + type=int, + help="The number of frames to evaluate in parallel to make videos play at higher fps.", + ) + parser.add_argument( + "--score_threshold", + default=0, + type=float, + help="Detections with a score under this threshold will not be considered. This currently only works in display mode.", + ) + parser.add_argument( + "--dataset", + default=None, + type=str, + help="If specified, override the dataset specified in the config with this one (example: coco2017_dataset).", + ) + parser.add_argument( + "--detect", + default=False, + dest="detect", + action="store_true", + help="Don't evauluate the mask branch at all and only do object detection. This only works for --display and --benchmark.", + ) + parser.add_argument( + "--display_fps", + default=False, + dest="display_fps", + action="store_true", + help="When displaying / saving video, draw the FPS on the frame", + ) + parser.add_argument( + "--emulate_playback", + default=False, + dest="emulate_playback", + action="store_true", + help="When saving a video, emulate the framerate that you'd get running in real-time mode.", + ) + + parser.set_defaults( + no_bar=False, + display=False, + resume=False, + output_coco_json=False, + output_web_json=False, + shuffle=False, + benchmark=False, + no_sort=False, + no_hash=False, + mask_proto_debug=False, + crop=True, + detect=False, + display_fps=False, + emulate_playback=False, + ) global args args = parser.parse_args(argv) if args.output_web_json: args.output_coco_json = True - + if args.seed is not None: random.seed(args.seed) + iou_thresholds = [x / 100 for x in range(50, 100, 5)] -coco_cats = {} # Call prep_coco_cats to fill this +coco_cats = {} # Call prep_coco_cats to fill this coco_cats_inv = {} color_cache = defaultdict(lambda: {}) -def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, mask_alpha=0.45, fps_str=''): + +def prep_display( + dets_out, + img, + h, + w, + undo_transform=True, + class_color=False, + mask_alpha=0.45, + fps_str="", +): """ Note: If undo_transform=False then im_h and im_w are allowed to be None. """ @@ -142,18 +306,23 @@ def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, ma else: img_gpu = img / 255.0 h, w, _ = img.shape - - with timer.env('Postprocess'): + + with timer.env("Postprocess"): save = cfg.rescore_bbox cfg.rescore_bbox = True - t = postprocess(dets_out, w, h, visualize_lincomb = args.display_lincomb, - crop_masks = args.crop, - score_threshold = args.score_threshold) + t = postprocess( + dets_out, + w, + h, + visualize_lincomb=args.display_lincomb, + crop_masks=args.crop, + score_threshold=args.score_threshold, + ) cfg.rescore_bbox = save - with timer.env('Copy'): - idx = t[1].argsort(0, descending=True)[:args.top_k] - + with timer.env("Copy"): + idx = t[1].argsort(0, descending=True)[: args.top_k] + if cfg.eval_mask_branch: # Masks are drawn on the GPU, so don't copy masks = t[3][idx] @@ -170,7 +339,7 @@ def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, ma def get_color(j, on_gpu=None): global color_cache color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS) - + if on_gpu is not None and color_idx in color_cache[on_gpu]: return color_cache[on_gpu][color_idx] else: @@ -179,7 +348,7 @@ def get_color(j, on_gpu=None): # The image might come in as RGB or BRG, depending color = (color[2], color[1], color[0]) if on_gpu is not None: - color = torch.Tensor(color).to(on_gpu).float() / 255. + color = torch.Tensor(color).to(on_gpu).float() / 255.0 color_cache[on_gpu][color_idx] = color return color @@ -189,35 +358,42 @@ def get_color(j, on_gpu=None): if args.display_masks and cfg.eval_mask_branch and num_dets_to_consider > 0: # After this, mask is of size [num_dets, h, w, 1] masks = masks[:num_dets_to_consider, :, :, None] - + # Prepare the RGB images for each mask given their color (size [num_dets, h, w, 1]) - colors = torch.cat([get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3) for j in range(num_dets_to_consider)], dim=0) + colors = torch.cat( + [ + get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3) + for j in range(num_dets_to_consider) + ], + dim=0, + ) masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha # This is 1 everywhere except for 1-mask_alpha where the mask is inv_alph_masks = masks * (-mask_alpha) + 1 - + # I did the math for this on pen and paper. This whole block should be equivalent to: # for j in range(num_dets_to_consider): # img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j] masks_color_summand = masks_color[0] if num_dets_to_consider > 1: - inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider-1)].cumprod(dim=0) + inv_alph_cumul = inv_alph_masks[: (num_dets_to_consider - 1)].cumprod(dim=0) masks_color_cumul = masks_color[1:] * inv_alph_cumul masks_color_summand += masks_color_cumul.sum(dim=0) img_gpu = img_gpu * inv_alph_masks.prod(dim=0) + masks_color_summand - + if args.display_fps: - # Draw the box for the fps on the GPU + # Draw the box for the fps on the GPU font_face = cv2.FONT_HERSHEY_DUPLEX font_scale = 0.6 font_thickness = 1 - text_w, text_h = cv2.getTextSize(fps_str, font_face, font_scale, font_thickness)[0] - - img_gpu[0:text_h+8, 0:text_w+8] *= 0.6 # 1 - Box alpha + text_w, text_h = cv2.getTextSize( + fps_str, font_face, font_scale, font_thickness + )[0] + img_gpu[0 : text_h + 8, 0 : text_w + 8] *= 0.6 # 1 - Box alpha # Then draw the stuff that needs to be done on the cpu # Note, make sure this is a uint8 tensor or opencv will not anti alias text for whatever reason @@ -228,8 +404,17 @@ def get_color(j, on_gpu=None): text_pt = (4, text_h + 2) text_color = [255, 255, 255] - cv2.putText(img_numpy, fps_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA) - + cv2.putText( + img_numpy, + fps_str, + text_pt, + font_face, + font_scale, + text_color, + font_thickness, + cv2.LINE_AA, + ) + if num_dets_to_consider == 0: return img_numpy @@ -244,29 +429,46 @@ def get_color(j, on_gpu=None): if args.display_text: _class = cfg.dataset.class_names[classes[j]] - text_str = '%s: %.2f' % (_class, score) if args.display_scores else _class + text_str = ( + "%s: %.2f" % (_class, score) if args.display_scores else _class + ) font_face = cv2.FONT_HERSHEY_DUPLEX font_scale = 0.6 font_thickness = 1 - text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0] + text_w, text_h = cv2.getTextSize( + text_str, font_face, font_scale, font_thickness + )[0] text_pt = (x1, y1 - 3) text_color = [255, 255, 255] - cv2.rectangle(img_numpy, (x1, y1), (x1 + text_w, y1 - text_h - 4), color, -1) - cv2.putText(img_numpy, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA) - - + cv2.rectangle( + img_numpy, (x1, y1), (x1 + text_w, y1 - text_h - 4), color, -1 + ) + cv2.putText( + img_numpy, + text_str, + text_pt, + font_face, + font_scale, + text_color, + font_thickness, + cv2.LINE_AA, + ) + return img_numpy + def prep_benchmark(dets_out, h, w): - with timer.env('Postprocess'): - t = postprocess(dets_out, w, h, crop_masks=args.crop, score_threshold=args.score_threshold) + with timer.env("Postprocess"): + t = postprocess( + dets_out, w, h, crop_masks=args.crop, score_threshold=args.score_threshold + ) - with timer.env('Copy'): - classes, scores, boxes, masks = [x[:args.top_k] for x in t] + with timer.env("Copy"): + classes, scores, boxes, masks = [x[: args.top_k] for x in t] if isinstance(scores, list): box_scores = scores[0].cpu().numpy() mask_scores = scores[1].cpu().numpy() @@ -275,11 +477,12 @@ def prep_benchmark(dets_out, h, w): classes = classes.cpu().numpy() boxes = boxes.cpu().numpy() masks = masks.cpu().numpy() - - with timer.env('Sync'): + + with timer.env("Sync"): # Just in case torch.cuda.synchronize() + def prep_coco_cats(): """ Prepare inverted table for category id lookup given a coco cats object. """ for coco_cat_id, transformed_cat_id_p1 in get_label_map().items(): @@ -292,115 +495,142 @@ def get_coco_cat(transformed_cat_id): """ transformed_cat_id is [0,80) as indices in cfg.dataset.class_names """ return coco_cats[transformed_cat_id] + def get_transformed_cat(coco_cat_id): """ transformed_cat_id is [0,80) as indices in cfg.dataset.class_names """ return coco_cats_inv[coco_cat_id] class Detections: - def __init__(self): self.bbox_data = [] self.mask_data = [] - def add_bbox(self, image_id:int, category_id:int, bbox:list, score:float): + def add_bbox(self, image_id: int, category_id: int, bbox: list, score: float): """ Note that bbox should be a list or tuple of (x1, y1, x2, y2) """ - bbox = [bbox[0], bbox[1], bbox[2]-bbox[0], bbox[3]-bbox[1]] + bbox = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] # Round to the nearest 10th to avoid huge file sizes, as COCO suggests - bbox = [round(float(x)*10)/10 for x in bbox] - - self.bbox_data.append({ - 'image_id': int(image_id), - 'category_id': get_coco_cat(int(category_id)), - 'bbox': bbox, - 'score': float(score) - }) + bbox = [round(float(x) * 10) / 10 for x in bbox] + + self.bbox_data.append( + { + "image_id": int(image_id), + "category_id": get_coco_cat(int(category_id)), + "bbox": bbox, + "score": float(score), + } + ) - def add_mask(self, image_id:int, category_id:int, segmentation:np.ndarray, score:float): + def add_mask( + self, image_id: int, category_id: int, segmentation: np.ndarray, score: float + ): """ The segmentation should be the full mask, the size of the image and with size [h, w]. """ rle = pycocotools.mask.encode(np.asfortranarray(segmentation.astype(np.uint8))) - rle['counts'] = rle['counts'].decode('ascii') # json.dump doesn't like bytes strings - - self.mask_data.append({ - 'image_id': int(image_id), - 'category_id': get_coco_cat(int(category_id)), - 'segmentation': rle, - 'score': float(score) - }) - + rle["counts"] = rle["counts"].decode( + "ascii" + ) # json.dump doesn't like bytes strings + + self.mask_data.append( + { + "image_id": int(image_id), + "category_id": get_coco_cat(int(category_id)), + "segmentation": rle, + "score": float(score), + } + ) + def dump(self): dump_arguments = [ (self.bbox_data, args.bbox_det_file), - (self.mask_data, args.mask_det_file) + (self.mask_data, args.mask_det_file), ] for data, path in dump_arguments: - with open(path, 'w') as f: + with open(path, "w") as f: json.dump(data, f) - + def dump_web(self): """ Dumps it in the format for my web app. Warning: bad code ahead! """ - config_outs = ['preserve_aspect_ratio', 'use_prediction_module', - 'use_yolo_regressors', 'use_prediction_matching', - 'train_masks'] + config_outs = [ + "preserve_aspect_ratio", + "use_prediction_module", + "use_yolo_regressors", + "use_prediction_matching", + "train_masks", + ] - output = { - 'info' : { - 'Config': {key: getattr(cfg, key) for key in config_outs}, - } - } + output = {"info": {"Config": {key: getattr(cfg, key) for key in config_outs},}} - image_ids = list(set([x['image_id'] for x in self.bbox_data])) + image_ids = list(set([x["image_id"] for x in self.bbox_data])) image_ids.sort() image_lookup = {_id: idx for idx, _id in enumerate(image_ids)} - output['images'] = [{'image_id': image_id, 'dets': []} for image_id in image_ids] + output["images"] = [ + {"image_id": image_id, "dets": []} for image_id in image_ids + ] # These should already be sorted by score with the way prep_metrics works. for bbox, mask in zip(self.bbox_data, self.mask_data): - image_obj = output['images'][image_lookup[bbox['image_id']]] - image_obj['dets'].append({ - 'score': bbox['score'], - 'bbox': bbox['bbox'], - 'category': cfg.dataset.class_names[get_transformed_cat(bbox['category_id'])], - 'mask': mask['segmentation'], - }) - - with open(os.path.join(args.web_det_path, '%s.json' % cfg.name), 'w') as f: + image_obj = output["images"][image_lookup[bbox["image_id"]]] + image_obj["dets"].append( + { + "score": bbox["score"], + "bbox": bbox["bbox"], + "category": cfg.dataset.class_names[ + get_transformed_cat(bbox["category_id"]) + ], + "mask": mask["segmentation"], + } + ) + + with open(os.path.join(args.web_det_path, "%s.json" % cfg.name), "w") as f: json.dump(output, f) - - def _mask_iou(mask1, mask2, iscrowd=False): - with timer.env('Mask IoU'): + with timer.env("Mask IoU"): ret = mask_iou(mask1, mask2, iscrowd) return ret.cpu() + def _bbox_iou(bbox1, bbox2, iscrowd=False): - with timer.env('BBox IoU'): + with timer.env("BBox IoU"): ret = jaccard(bbox1, bbox2, iscrowd) return ret.cpu() -def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections:Detections=None): + +def prep_metrics( + ap_data, + dets, + img, + gt, + gt_masks, + h, + w, + num_crowd, + image_id, + detections: Detections = None, +): """ Returns a list of APs for this image, with each element being for a class """ if not args.output_coco_json: - with timer.env('Prepare gt'): + with timer.env("Prepare gt"): gt_boxes = torch.Tensor(gt[:, :4]) gt_boxes[:, [0, 2]] *= w gt_boxes[:, [1, 3]] *= h gt_classes = list(gt[:, 4].astype(int)) - gt_masks = torch.Tensor(gt_masks).view(-1, h*w) + gt_masks = torch.Tensor(gt_masks).view(-1, h * w) if num_crowd > 0: split = lambda x: (x[-num_crowd:], x[:-num_crowd]) - crowd_boxes , gt_boxes = split(gt_boxes) - crowd_masks , gt_masks = split(gt_masks) + crowd_boxes, gt_boxes = split(gt_boxes) + crowd_masks, gt_masks = split(gt_masks) crowd_classes, gt_classes = split(gt_classes) - with timer.env('Postprocess'): - classes, scores, boxes, masks = postprocess(dets, w, h, crop_masks=args.crop, score_threshold=args.score_threshold) + with timer.env("Postprocess"): + classes, scores, boxes, masks = postprocess( + dets, w, h, crop_masks=args.crop, score_threshold=args.score_threshold + ) if classes.size(0) == 0: return @@ -413,31 +643,36 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de scores = list(scores.cpu().numpy().astype(float)) box_scores = scores mask_scores = scores - masks = masks.view(-1, h*w).cuda() + masks = masks.view(-1, h * w).cuda() boxes = boxes.cuda() - if args.output_coco_json: - with timer.env('JSON Output'): + with timer.env("JSON Output"): boxes = boxes.cpu().numpy() masks = masks.view(-1, h, w).cpu().numpy() for i in range(masks.shape[0]): # Make sure that the bounding box actually makes sense and a mask was produced if (boxes[i, 3] - boxes[i, 1]) * (boxes[i, 2] - boxes[i, 0]) > 0: - detections.add_bbox(image_id, classes[i], boxes[i,:], box_scores[i]) - detections.add_mask(image_id, classes[i], masks[i,:,:], mask_scores[i]) + detections.add_bbox( + image_id, classes[i], boxes[i, :], box_scores[i] + ) + detections.add_mask( + image_id, classes[i], masks[i, :, :], mask_scores[i] + ) return - - with timer.env('Eval Setup'): + + with timer.env("Eval Setup"): num_pred = len(classes) - num_gt = len(gt_classes) + num_gt = len(gt_classes) mask_iou_cache = _mask_iou(masks, gt_masks) bbox_iou_cache = _bbox_iou(boxes.float(), gt_boxes.float()) if num_crowd > 0: crowd_mask_iou_cache = _mask_iou(masks, crowd_masks, iscrowd=True) - crowd_bbox_iou_cache = _bbox_iou(boxes.float(), crowd_boxes.float(), iscrowd=True) + crowd_bbox_iou_cache = _bbox_iou( + boxes.float(), crowd_boxes.float(), iscrowd=True + ) else: crowd_mask_iou_cache = None crowd_bbox_iou_cache = None @@ -446,44 +681,52 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de mask_indices = sorted(box_indices, key=lambda i: -mask_scores[i]) iou_types = [ - ('box', lambda i,j: bbox_iou_cache[i, j].item(), - lambda i,j: crowd_bbox_iou_cache[i,j].item(), - lambda i: box_scores[i], box_indices), - ('mask', lambda i,j: mask_iou_cache[i, j].item(), - lambda i,j: crowd_mask_iou_cache[i,j].item(), - lambda i: mask_scores[i], mask_indices) + ( + "box", + lambda i, j: bbox_iou_cache[i, j].item(), + lambda i, j: crowd_bbox_iou_cache[i, j].item(), + lambda i: box_scores[i], + box_indices, + ), + ( + "mask", + lambda i, j: mask_iou_cache[i, j].item(), + lambda i, j: crowd_mask_iou_cache[i, j].item(), + lambda i: mask_scores[i], + mask_indices, + ), ] - timer.start('Main loop') + timer.start("Main loop") for _class in set(classes + gt_classes): ap_per_iou = [] num_gt_for_class = sum([1 for x in gt_classes if x == _class]) - + for iouIdx in range(len(iou_thresholds)): iou_threshold = iou_thresholds[iouIdx] for iou_type, iou_func, crowd_func, score_func, indices in iou_types: gt_used = [False] * len(gt_classes) - + ap_obj = ap_data[iou_type][iouIdx][_class] ap_obj.add_gt_positives(num_gt_for_class) for i in indices: if classes[i] != _class: continue - + max_iou_found = iou_threshold max_match_idx = -1 for j in range(num_gt): if gt_used[j] or gt_classes[j] != _class: continue - + iou = iou_func(i, j) if iou > max_iou_found: max_iou_found = iou max_match_idx = j - + if max_match_idx >= 0: gt_used[max_match_idx] = True ap_obj.push(score_func(i), True) @@ -495,7 +738,7 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de for j in range(len(crowd_classes)): if crowd_classes[j] != _class: continue - + iou = crowd_func(i, j) if iou > iou_threshold: @@ -507,7 +750,7 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de # begin with, but accuracy is of the utmost importance. if not matched_crowd: ap_obj.push(score_func(i), False) - timer.stop('Main loop') + timer.stop("Main loop") class APDataObject: @@ -520,10 +763,10 @@ def __init__(self): self.data_points = [] self.num_gt_positives = 0 - def push(self, score:float, is_true:bool): + def push(self, score: float, is_true: bool): self.data_points.append((score, is_true)) - - def add_gt_positives(self, num_positives:int): + + def add_gt_positives(self, num_positives: int): """ Call this once per image. """ self.num_gt_positives += num_positives @@ -540,18 +783,20 @@ def get_ap(self) -> float: self.data_points.sort(key=lambda x: -x[0]) precisions = [] - recalls = [] - num_true = 0 + recalls = [] + num_true = 0 num_false = 0 # Compute the precision-recall curve. The x axis is recalls and the y axis precisions. for datum in self.data_points: # datum[1] is whether the detection a true or false positive - if datum[1]: num_true += 1 - else: num_false += 1 - + if datum[1]: + num_true += 1 + else: + num_false += 1 + precision = num_true / (num_true + num_false) - recall = num_true / self.num_gt_positives + recall = num_true / self.num_gt_positives precisions.append(precision) recalls.append(recall) @@ -559,19 +804,19 @@ def get_ap(self) -> float: # Smooth the curve by computing [max(precisions[i:]) for i in range(len(precisions))] # Basically, remove any temporary dips from the curve. # At least that's what I think, idk. COCOEval did it so I do too. - for i in range(len(precisions)-1, 0, -1): - if precisions[i] > precisions[i-1]: - precisions[i-1] = precisions[i] + for i in range(len(precisions) - 1, 0, -1): + if precisions[i] > precisions[i - 1]: + precisions[i - 1] = precisions[i] # Compute the integral of precision(recall) d_recall from recall=0->1 using fixed-length riemann summation with 101 bars. - y_range = [0] * 101 # idx 0 is recall == 0.0 and idx 100 is recall == 1.00 + y_range = [0] * 101 # idx 0 is recall == 0.0 and idx 100 is recall == 1.00 x_range = np.array([x / 100 for x in range(101)]) recalls = np.array(recalls) # I realize this is weird, but all it does is find the nearest precision(x) for a given x in x_range. # Basically, if the closest recall we have to 0.01 is 0.009 this sets precision(0.01) = precision(0.009). # I approximate the integral this way, because that's how COCOEval does it. - indices = np.searchsorted(recalls, x_range, side='left') + indices = np.searchsorted(recalls, x_range, side="left") for bar_idx, precision_idx in enumerate(indices): if precision_idx < len(precisions): y_range[bar_idx] = precisions[precision_idx] @@ -580,6 +825,7 @@ def get_ap(self) -> float: # avg([precision(x) for x in 0:0.01:1]) return sum(y_range) / len(y_range) + def badhash(x): """ Just a quick and dirty hash function for doing a deterministic shuffle based on image_id. @@ -587,18 +833,19 @@ def badhash(x): Source: https://stackoverflow.com/questions/664014/what-integer-hash-function-are-good-that-accepts-an-integer-hash-key """ - x = (((x >> 16) ^ x) * 0x045d9f3b) & 0xFFFFFFFF - x = (((x >> 16) ^ x) * 0x045d9f3b) & 0xFFFFFFFF - x = ((x >> 16) ^ x) & 0xFFFFFFFF + x = (((x >> 16) ^ x) * 0x045D9F3B) & 0xFFFFFFFF + x = (((x >> 16) ^ x) * 0x045D9F3B) & 0xFFFFFFFF + x = ((x >> 16) ^ x) & 0xFFFFFFFF return x -def evalimage(net:Yolact, path:str, save_path:str=None): + +def evalimage(net: Yolact, path: str, save_path: str = None): frame = torch.from_numpy(cv2.imread(path)).cuda().float() batch = FastBaseTransform()(frame.unsqueeze(0)) preds = net(batch) img_numpy = prep_display(preds, frame, None, None, undo_transform=False) - + if save_path is None: img_numpy = img_numpy[:, :, (2, 1, 0)] @@ -609,52 +856,57 @@ def evalimage(net:Yolact, path:str, save_path:str=None): else: cv2.imwrite(save_path, img_numpy) -def evalimages(net:Yolact, input_folder:str, output_folder:str): + +def evalimages(net: Yolact, input_folder: str, output_folder: str): if not os.path.exists(output_folder): os.mkdir(output_folder) print() - for p in Path(input_folder).glob('*'): + for p in Path(input_folder).glob("*"): path = str(p) name = os.path.basename(path) - name = '.'.join(name.split('.')[:-1]) + '.png' + name = ".".join(name.split(".")[:-1]) + ".png" out_path = os.path.join(output_folder, name) evalimage(net, path, out_path) - print(path + ' -> ' + out_path) - print('Done.') + print(path + " -> " + out_path) + print("Done.") + from multiprocessing.pool import ThreadPool from queue import Queue + class CustomDataParallel(torch.nn.DataParallel): """ A Custom Data Parallel class that properly gathers lists of dictionaries. """ + def gather(self, outputs, output_device): # Note that I don't actually want to convert everything to the output_device return sum(outputs, []) -def evalvideo(net:Yolact, path:str, out_path:str=None): + +def evalvideo(net: Yolact, path: str, out_path: str = None): # If the path is a digit, parse it as a webcam index is_webcam = path.isdigit() - + # If the input image size is constant, this make things faster (hence why we can use it in a video setting). cudnn.benchmark = True - + if is_webcam: vid = cv2.VideoCapture(int(path)) else: vid = cv2.VideoCapture(path) - + if not vid.isOpened(): print('Could not open video "%s"' % path) exit(-1) - target_fps = round(vid.get(cv2.CAP_PROP_FPS)) - frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) + target_fps = round(vid.get(cv2.CAP_PROP_FPS)) + frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) - + if is_webcam: - num_frames = float('inf') + num_frames = float("inf") else: num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT)) @@ -664,12 +916,17 @@ def evalvideo(net:Yolact, path:str, out_path:str=None): fps = 0 frame_time_target = 1 / target_fps running = True - fps_str = '' + fps_str = "" vid_done = False frames_displayed = 0 if out_path is not None: - out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), target_fps, (frame_width, frame_height)) + out = cv2.VideoWriter( + out_path, + cv2.VideoWriter_fourcc(*"mp4v"), + target_fps, + (frame_width, frame_height), + ) def cleanup_and_exit(): print() @@ -709,12 +966,20 @@ def eval_network(inp): def prep_frame(inp, fps_str): with torch.no_grad(): frame, preds = inp - return prep_display(preds, frame, None, None, undo_transform=False, class_color=True, fps_str=fps_str) + return prep_display( + preds, + frame, + None, + None, + undo_transform=False, + class_color=True, + fps_str=fps_str, + ) frame_buffer = Queue() video_fps = 0 - # All this timing code to make sure that + # All this timing code to make sure that def play_video(): try: nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done @@ -748,10 +1013,18 @@ def play_video(): progress = frames_displayed / num_frames * 100 progress_bar.set_val(frames_displayed) - print('\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps ' - % (repr(progress_bar), frames_displayed, num_frames, progress, fps), end='') + print( + "\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps " + % ( + repr(progress_bar), + frames_displayed, + num_frames, + progress, + fps, + ), + end="", + ) - # This is split because you don't want savevideo to require cv2 display functionality (see #197) if out_path is None and cv2.waitKey(1) == 27: # Press Escape to close @@ -768,13 +1041,19 @@ def play_video(): if frame_time_stabilizer < 0: frame_time_stabilizer = 0 - new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target) + new_target = ( + frame_time_stabilizer + if is_webcam + else max(frame_time_stabilizer, frame_time_target) + ) else: new_target = frame_time_target next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0) - target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe - + target_time = ( + frame_time_start + next_frame_target - 0.001 + ) # Let's just subtract a millisecond to be safe + if out_path is None or args.emulate_playback: # This gives more accurate timing than if sleeping the whole amount at once while time.time() < target_time: @@ -785,24 +1064,33 @@ def play_video(): except: # See issue #197 for why this is necessary import traceback - traceback.print_exc() + traceback.print_exc() - extract_frame = lambda x, i: (x[0][i] if x[1][i]['detection'] is None else x[0][i].to(x[1][i]['detection']['box'].device), [x[1][i]]) + extract_frame = lambda x, i: ( + x[0][i] + if x[1][i]["detection"] is None + else x[0][i].to(x[1][i]["detection"]["box"].device), + [x[1][i]], + ) # Prime the network on the first frame because I do some thread unsafe things otherwise - print('Initializing model... ', end='') + print("Initializing model... ", end="") first_batch = eval_network(transform_frame(get_next_frame(vid))) - print('Done.') + print("Done.") # For each frame the sequence of functions it needs to go through to be processed (in reversed order) sequence = [prep_frame, eval_network, transform_frame] pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2) pool.apply_async(play_video) - active_frames = [{'value': extract_frame(first_batch, i), 'idx': 0} for i in range(len(first_batch[0]))] + active_frames = [ + {"value": extract_frame(first_batch, i), "idx": 0} + for i in range(len(first_batch[0])) + ] print() - if out_path is None: print('Press Escape to close.') + if out_path is None: + print("Press Escape to close.") try: while vid.isOpened() and running: # Hard limit on frames in buffer so we don't run out of memory >.> @@ -816,84 +1104,97 @@ def play_video(): next_frames = pool.apply_async(get_next_frame, args=(vid,)) else: next_frames = None - + if not (vid_done and len(active_frames) == 0): # For each frame in our active processing queue, dispatch a job # for that frame using the current function in the sequence for frame in active_frames: - _args = [frame['value']] - if frame['idx'] == 0: + _args = [frame["value"]] + if frame["idx"] == 0: _args.append(fps_str) - frame['value'] = pool.apply_async(sequence[frame['idx']], args=_args) - + frame["value"] = pool.apply_async( + sequence[frame["idx"]], args=_args + ) + # For each frame whose job was the last in the sequence (i.e. for all final outputs) for frame in active_frames: - if frame['idx'] == 0: - frame_buffer.put(frame['value'].get()) + if frame["idx"] == 0: + frame_buffer.put(frame["value"].get()) # Remove the finished frames from the processing queue - active_frames = [x for x in active_frames if x['idx'] > 0] + active_frames = [x for x in active_frames if x["idx"] > 0] # Finish evaluating every frame in the processing queue and advanced their position in the sequence for frame in list(reversed(active_frames)): - frame['value'] = frame['value'].get() - frame['idx'] -= 1 + frame["value"] = frame["value"].get() + frame["idx"] -= 1 - if frame['idx'] == 0: + if frame["idx"] == 0: # Split this up into individual threads for prep_frame since it doesn't support batch size - active_frames += [{'value': extract_frame(frame['value'], i), 'idx': 0} for i in range(1, len(frame['value'][0]))] - frame['value'] = extract_frame(frame['value'], 0) - + active_frames += [ + {"value": extract_frame(frame["value"], i), "idx": 0} + for i in range(1, len(frame["value"][0])) + ] + frame["value"] = extract_frame(frame["value"], 0) + # Finish loading in the next frames and add them to the processing queue if next_frames is not None: frames = next_frames.get() if len(frames) == 0: vid_done = True else: - active_frames.append({'value': frames, 'idx': len(sequence)-1}) + active_frames.append( + {"value": frames, "idx": len(sequence) - 1} + ) # Compute FPS frame_times.add(time.time() - start_time) fps = args.video_multiframe / frame_times.get_avg() else: fps = 0 - - fps_str = 'Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d' % (fps, video_fps, frame_buffer.qsize()) + + fps_str = ( + "Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d" + % (fps, video_fps, frame_buffer.qsize()) + ) if not args.display_fps: - print('\r' + fps_str + ' ', end='') + print("\r" + fps_str + " ", end="") except KeyboardInterrupt: - print('\nStopping...') - + print("\nStopping...") + cleanup_and_exit() -def evaluate(net:Yolact, dataset, train_mode=False): + +def evaluate(net: Yolact, dataset, train_mode=False): net.detect.use_fast_nms = args.fast_nms net.detect.use_cross_class_nms = args.cross_class_nms cfg.mask_proto_debug = args.mask_proto_debug # TODO Currently we do not support Fast Mask Re-scroing in evalimage, evalimages, and evalvideo if args.image is not None: - if ':' in args.image: - inp, out = args.image.split(':') + if ":" in args.image: + inp, out = args.image.split(":") evalimage(net, inp, out) else: evalimage(net, args.image) return elif args.images is not None: - inp, out = args.images.split(':') + inp, out = args.images.split(":") evalimages(net, inp, out) return elif args.video is not None: - if ':' in args.video: - inp, out = args.video.split(':') + if ":" in args.video: + inp, out = args.video.split(":") evalvideo(net, inp, out) else: evalvideo(net, args.video) return frame_times = MovingAverage() - dataset_size = len(dataset) if args.max_images < 0 else min(args.max_images, len(dataset)) + dataset_size = ( + len(dataset) if args.max_images < 0 else min(args.max_images, len(dataset)) + ) progress_bar = ProgressBar(30, dataset_size) print() @@ -902,15 +1203,21 @@ def evaluate(net:Yolact, dataset, train_mode=False): # For each class and iou, stores tuples (score, isPositive) # Index ap_data[type][iouIdx][classIdx] ap_data = { - 'box' : [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds], - 'mask': [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds] + "box": [ + [APDataObject() for _ in cfg.dataset.class_names] + for _ in iou_thresholds + ], + "mask": [ + [APDataObject() for _ in cfg.dataset.class_names] + for _ in iou_thresholds + ], } detections = Detections() else: - timer.disable('Load Data') + timer.disable("Load Data") dataset_indices = list(range(len(dataset))) - + if args.shuffle: random.shuffle(dataset_indices) elif not args.no_sort: @@ -932,20 +1239,20 @@ def evaluate(net:Yolact, dataset, train_mode=False): for it, image_idx in enumerate(dataset_indices): timer.reset() - with timer.env('Load Data'): + with timer.env("Load Data"): img, gt, gt_masks, h, w, num_crowd = dataset.pull_item(image_idx) # Test flag, do not upvote if cfg.mask_proto_debug: - with open('scripts/info.txt', 'w') as f: + with open("scripts/info.txt", "w") as f: f.write(str(dataset.ids[image_idx])) - np.save('scripts/gt.npy', gt_masks) + np.save("scripts/gt.npy", gt_masks) batch = Variable(img.unsqueeze(0)) if args.cuda: batch = batch.cuda() - with timer.env('Network Extra'): + with timer.env("Network Extra"): preds = net(batch) # Perform the meat of the operation here depending on our mode. if args.display: @@ -953,115 +1260,154 @@ def evaluate(net:Yolact, dataset, train_mode=False): elif args.benchmark: prep_benchmark(preds, h, w) else: - prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd, dataset.ids[image_idx], detections) - + prep_metrics( + ap_data, + preds, + img, + gt, + gt_masks, + h, + w, + num_crowd, + dataset.ids[image_idx], + detections, + ) + # First couple of images take longer because we're constructing the graph. # Since that's technically initialization, don't include those in the FPS calculations. if it > 1: frame_times.add(timer.total_time()) - + if args.display: if it > 1: - print('Avg FPS: %.4f' % (1 / frame_times.get_avg())) + print("Avg FPS: %.4f" % (1 / frame_times.get_avg())) plt.imshow(img_numpy) plt.title(str(dataset.ids[image_idx])) plt.show() elif not args.no_bar: - if it > 1: fps = 1 / frame_times.get_avg() - else: fps = 0 - progress = (it+1) / dataset_size * 100 - progress_bar.set_val(it+1) - print('\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps ' - % (repr(progress_bar), it+1, dataset_size, progress, fps), end='') - - + if it > 1: + fps = 1 / frame_times.get_avg() + else: + fps = 0 + progress = (it + 1) / dataset_size * 100 + progress_bar.set_val(it + 1) + print( + "\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps " + % (repr(progress_bar), it + 1, dataset_size, progress, fps), + end="", + ) if not args.display and not args.benchmark: print() if args.output_coco_json: - print('Dumping detections...') + print("Dumping detections...") if args.output_web_json: detections.dump_web() else: detections.dump() else: if not train_mode: - print('Saving data...') - with open(args.ap_data_file, 'wb') as f: + print("Saving data...") + with open(args.ap_data_file, "wb") as f: pickle.dump(ap_data, f) return calc_map(ap_data) elif args.benchmark: print() print() - print('Stats for the last frame:') + print("Stats for the last frame:") timer.print_stats() avg_seconds = frame_times.get_avg() - print('Average: %5.2f fps, %5.2f ms' % (1 / frame_times.get_avg(), 1000*avg_seconds)) + print( + "Average: %5.2f fps, %5.2f ms" + % (1 / frame_times.get_avg(), 1000 * avg_seconds) + ) except KeyboardInterrupt: - print('Stopping...') + print("Stopping...") def calc_map(ap_data): - print('Calculating mAP...') - aps = [{'box': [], 'mask': []} for _ in iou_thresholds] + print("Calculating mAP...") + aps = [{"box": [], "mask": []} for _ in iou_thresholds] for _class in range(len(cfg.dataset.class_names)): for iou_idx in range(len(iou_thresholds)): - for iou_type in ('box', 'mask'): + for iou_type in ("box", "mask"): ap_obj = ap_data[iou_type][iou_idx][_class] if not ap_obj.is_empty(): aps[iou_idx][iou_type].append(ap_obj.get_ap()) - all_maps = {'box': OrderedDict(), 'mask': OrderedDict()} + all_maps = {"box": OrderedDict(), "mask": OrderedDict()} # Looking back at it, this code is really hard to read :/ - for iou_type in ('box', 'mask'): - all_maps[iou_type]['all'] = 0 # Make this first in the ordereddict + for iou_type in ("box", "mask"): + all_maps[iou_type]["all"] = 0 # Make this first in the ordereddict for i, threshold in enumerate(iou_thresholds): - mAP = sum(aps[i][iou_type]) / len(aps[i][iou_type]) * 100 if len(aps[i][iou_type]) > 0 else 0 - all_maps[iou_type][int(threshold*100)] = mAP - all_maps[iou_type]['all'] = (sum(all_maps[iou_type].values()) / (len(all_maps[iou_type].values())-1)) - + mAP = ( + sum(aps[i][iou_type]) / len(aps[i][iou_type]) * 100 + if len(aps[i][iou_type]) > 0 + else 0 + ) + all_maps[iou_type][int(threshold * 100)] = mAP + all_maps[iou_type]["all"] = sum(all_maps[iou_type].values()) / ( + len(all_maps[iou_type].values()) - 1 + ) + print_maps(all_maps) - + # Put in a prettier format so we can serialize it to json during training all_maps = {k: {j: round(u, 2) for j, u in v.items()} for k, v in all_maps.items()} return all_maps + def print_maps(all_maps): - # Warning: hacky - make_row = lambda vals: (' %5s |' * len(vals)) % tuple(vals) - make_sep = lambda n: ('-------+' * n) + # Warning: hacky + make_row = lambda vals: (" %5s |" * len(vals)) % tuple(vals) + make_sep = lambda n: ("-------+" * n) print() - print(make_row([''] + [('.%d ' % x if isinstance(x, int) else x + ' ') for x in all_maps['box'].keys()])) - print(make_sep(len(all_maps['box']) + 1)) - for iou_type in ('box', 'mask'): - print(make_row([iou_type] + ['%.2f' % x if x < 100 else '%.1f' % x for x in all_maps[iou_type].values()])) - print(make_sep(len(all_maps['box']) + 1)) + print( + make_row( + [""] + + [ + (".%d " % x if isinstance(x, int) else x + " ") + for x in all_maps["box"].keys() + ] + ) + ) + print(make_sep(len(all_maps["box"]) + 1)) + for iou_type in ("box", "mask"): + print( + make_row( + [iou_type] + + [ + "%.2f" % x if x < 100 else "%.1f" % x + for x in all_maps[iou_type].values() + ] + ) + ) + print(make_sep(len(all_maps["box"]) + 1)) print() - -if __name__ == '__main__': +if __name__ == "__main__": parse_args() if args.config is not None: set_cfg(args.config) - if args.trained_model == 'interrupt': - args.trained_model = SavePath.get_interrupt('weights/') - elif args.trained_model == 'latest': - args.trained_model = SavePath.get_latest('weights/', cfg.name) + if args.trained_model == "interrupt": + args.trained_model = SavePath.get_interrupt("weights/") + elif args.trained_model == "latest": + args.trained_model = SavePath.get_latest("weights/", cfg.name) if args.config is None: model_path = SavePath.from_str(args.trained_model) # TODO: Bad practice? Probably want to do a name lookup instead. - args.config = model_path.model_name + '_config' - print('Config not specified. Parsed %s from the file name.\n' % args.config) + args.config = model_path.model_name + "_config" + print("Config not specified. Parsed %s from the file name.\n" % args.config) set_cfg(args.config) if args.detect: @@ -1071,37 +1417,39 @@ def print_maps(all_maps): set_dataset(args.dataset) with torch.no_grad(): - if not os.path.exists('results'): - os.makedirs('results') + if not os.path.exists("results"): + os.makedirs("results") if args.cuda: cudnn.fastest = True - torch.set_default_tensor_type('torch.cuda.FloatTensor') + torch.set_default_tensor_type("torch.cuda.FloatTensor") else: - torch.set_default_tensor_type('torch.FloatTensor') + torch.set_default_tensor_type("torch.FloatTensor") if args.resume and not args.display: - with open(args.ap_data_file, 'rb') as f: + with open(args.ap_data_file, "rb") as f: ap_data = pickle.load(f) calc_map(ap_data) exit() if args.image is None and args.video is None and args.images is None: - dataset = COCODetection(cfg.dataset.valid_images, cfg.dataset.valid_info, - transform=BaseTransform(), has_gt=cfg.dataset.has_gt) + dataset = COCODetection( + cfg.dataset.valid_images, + cfg.dataset.valid_info, + transform=BaseTransform(), + has_gt=cfg.dataset.has_gt, + ) prep_coco_cats() else: - dataset = None + dataset = None - print('Loading model...', end='') + print("Loading model...", end="") net = Yolact() net.load_weights(args.trained_model) net.eval() - print(' Done.') + print(" Done.") if args.cuda: net = net.cuda() evaluate(net, dataset) - - diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index 982bef512..885c8898f 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -15,20 +15,39 @@ class _DCNv2(Function): @staticmethod - def forward(ctx, input, offset, mask, weight, bias, - stride, padding, dilation, deformable_groups): + def forward( + ctx, + input, + offset, + mask, + weight, + bias, + stride, + padding, + dilation, + deformable_groups, + ): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups - output = _backend.dcn_v2_forward(input, weight, bias, - offset, mask, - ctx.kernel_size[0], ctx.kernel_size[1], - ctx.stride[0], ctx.stride[1], - ctx.padding[0], ctx.padding[1], - ctx.dilation[0], ctx.dilation[1], - ctx.deformable_groups) + output = _backend.dcn_v2_forward( + input, + weight, + bias, + offset, + mask, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) ctx.save_for_backward(input, offset, mask, weight, bias) return output @@ -36,28 +55,57 @@ def forward(ctx, input, offset, mask, weight, bias, @once_differentiable def backward(ctx, grad_output): input, offset, mask, weight, bias = ctx.saved_tensors - grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ - _backend.dcn_v2_backward(input, weight, - bias, - offset, mask, - grad_output, - ctx.kernel_size[0], ctx.kernel_size[1], - ctx.stride[0], ctx.stride[1], - ctx.padding[0], ctx.padding[1], - ctx.dilation[0], ctx.dilation[1], - ctx.deformable_groups) - - return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ - None, None, None, None, + ( + grad_input, + grad_offset, + grad_mask, + grad_weight, + grad_bias, + ) = _backend.dcn_v2_backward( + input, + weight, + bias, + offset, + mask, + grad_output, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) + + return ( + grad_input, + grad_offset, + grad_mask, + grad_weight, + grad_bias, + None, + None, + None, + None, + ) dcn_v2_conv = _DCNv2.apply class DCNv2(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, stride, padding, dilation=1, deformable_groups=1): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): super(DCNv2, self).__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -67,8 +115,9 @@ def __init__(self, in_channels, out_channels, self.dilation = _pair(dilation) self.deformable_groups = deformable_groups - self.weight = nn.Parameter(torch.Tensor( - out_channels, in_channels, *self.kernel_size)) + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels, *self.kernel_size) + ) self.bias = nn.Parameter(torch.Tensor(out_channels)) self.reset_parameters() @@ -76,39 +125,64 @@ def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k - stdv = 1. / math.sqrt(n) + stdv = 1.0 / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) self.bias.data.zero_() def forward(self, input, offset, mask): - assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ - offset.shape[1] - assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ - mask.shape[1] - return dcn_v2_conv(input, offset, mask, - self.weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.deformable_groups) + assert ( + 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] + == offset.shape[1] + ) + assert ( + self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] + == mask.shape[1] + ) + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) class DCN(DCNv2): - - def __init__(self, in_channels, out_channels, - kernel_size, stride, padding, - dilation=1, deformable_groups=1): - super(DCN, self).__init__(in_channels, out_channels, - kernel_size, stride, padding, dilation, deformable_groups) - - channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] - self.conv_offset_mask = nn.Conv2d(self.in_channels, - channels_, - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - bias=True) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): + super(DCN, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + deformable_groups, + ) + + channels_ = ( + self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + ) + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True, + ) self.init_offset() def init_offset(self): @@ -120,26 +194,35 @@ def forward(self, input): o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) - return dcn_v2_conv(input, offset, mask, - self.weight, self.bias, - self.stride, - self.padding, - self.dilation, - self.deformable_groups) - + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) class _DCNv2Pooling(Function): @staticmethod - def forward(ctx, input, rois, offset, - spatial_scale, - pooled_size, - output_dim, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0): + def forward( + ctx, + input, + rois, + offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) ctx.output_dim = output_dim @@ -149,12 +232,19 @@ def forward(ctx, input, rois, offset, ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std - output, output_count = \ - _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, - ctx.no_trans, ctx.spatial_scale, - ctx.output_dim, ctx.group_size, - ctx.pooled_size, ctx.part_size, - ctx.sample_per_part, ctx.trans_std) + output, output_count = _backend.dcn_v2_psroi_pooling_forward( + input, + rois, + offset, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) ctx.save_for_backward(input, rois, offset, output_count) return output @@ -162,39 +252,52 @@ def forward(ctx, input, rois, offset, @once_differentiable def backward(ctx, grad_output): input, rois, offset, output_count = ctx.saved_tensors - grad_input, grad_offset = \ - _backend.dcn_v2_psroi_pooling_backward(grad_output, - input, - rois, - offset, - output_count, - ctx.no_trans, - ctx.spatial_scale, - ctx.output_dim, - ctx.group_size, - ctx.pooled_size, - ctx.part_size, - ctx.sample_per_part, - ctx.trans_std) - - return grad_input, None, grad_offset, \ - None, None, None, None, None, None, None, None + grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( + grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) + + return ( + grad_input, + None, + grad_offset, + None, + None, + None, + None, + None, + None, + None, + None, + ) dcn_v2_pooling = _DCNv2Pooling.apply class DCNv2Pooling(nn.Module): - - def __init__(self, - spatial_scale, - pooled_size, - output_dim, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0): + def __init__( + self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale self.pooled_size = pooled_size @@ -209,49 +312,57 @@ def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim if self.no_trans: offset = input.new() - return dcn_v2_pooling(input, rois, offset, - self.spatial_scale, - self.pooled_size, - self.output_dim, - self.no_trans, - self.group_size, - self.part_size, - self.sample_per_part, - self.trans_std) + return dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) class DCNPooling(DCNv2Pooling): - - def __init__(self, - spatial_scale, - pooled_size, - output_dim, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0, - deform_fc_dim=1024): - super(DCNPooling, self).__init__(spatial_scale, - pooled_size, - output_dim, - no_trans, - group_size, - part_size, - sample_per_part, - trans_std) + def __init__( + self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + deform_fc_dim=1024, + ): + super(DCNPooling, self).__init__( + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std, + ) self.deform_fc_dim = deform_fc_dim if not no_trans: self.offset_mask_fc = nn.Sequential( - nn.Linear(self.pooled_size * self.pooled_size * - self.output_dim, self.deform_fc_dim), + nn.Linear( + self.pooled_size * self.pooled_size * self.output_dim, + self.deform_fc_dim, + ), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_dim, self.deform_fc_dim), nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_dim, self.pooled_size * - self.pooled_size * 3) + nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3), ) self.offset_mask_fc[4].weight.data.zero_() self.offset_mask_fc[4].bias.data.zero_() @@ -263,41 +374,55 @@ def forward(self, input, rois): # do roi_align first n = rois.shape[0] - roi = dcn_v2_pooling(input, rois, offset, - self.spatial_scale, - self.pooled_size, - self.output_dim, - True, # no trans - self.group_size, - self.part_size, - self.sample_per_part, - self.trans_std) + roi = dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) # build mask and offset offset_mask = self.offset_mask_fc(roi.view(n, -1)) - offset_mask = offset_mask.view( - n, 3, self.pooled_size, self.pooled_size) + offset_mask = offset_mask.view(n, 3, self.pooled_size, self.pooled_size) o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) # do pooling with offset and mask - return dcn_v2_pooling(input, rois, offset, - self.spatial_scale, - self.pooled_size, - self.output_dim, - self.no_trans, - self.group_size, - self.part_size, - self.sample_per_part, - self.trans_std) * mask + return ( + dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + * mask + ) # only roi_align - return dcn_v2_pooling(input, rois, offset, - self.spatial_scale, - self.pooled_size, - self.output_dim, - self.no_trans, - self.group_size, - self.part_size, - self.sample_per_part, - self.trans_std) + return dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) diff --git a/external/DCNv2/setup.py b/external/DCNv2/setup.py index 571b5365c..d79708378 100644 --- a/external/DCNv2/setup.py +++ b/external/DCNv2/setup.py @@ -14,6 +14,7 @@ requirements = ["torch", "torchvision"] + def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") @@ -38,7 +39,7 @@ def get_extensions(): "-D__CUDA_NO_HALF2_OPERATORS__", ] else: - raise NotImplementedError('Cuda is not available') + raise NotImplementedError("Cuda is not available") sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] @@ -53,6 +54,7 @@ def get_extensions(): ] return ext_modules + setup( name="DCNv2", version="0.1", diff --git a/external/DCNv2/test.py b/external/DCNv2/test.py index 3bd5bd223..279b5797e 100644 --- a/external/DCNv2/test.py +++ b/external/DCNv2/test.py @@ -21,8 +21,8 @@ def conv_identify(weight, bias): weight.data.zero_() bias.data.zero_() o, i, h, w = weight.shape - y = h//2 - x = w//2 + y = h // 2 + x = w // 2 for p in range(i): for q in range(o): if p == q: @@ -30,21 +30,33 @@ def conv_identify(weight, bias): def check_zero_offset(): - conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, - kernel_size=(kH, kW), - stride=(1, 1), - padding=(1, 1), - bias=True).cuda() - - conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, - kernel_size=(kH, kW), - stride=(1, 1), - padding=(1, 1), - bias=True).cuda() - - dcn_v2 = DCNv2(inC, outC, (kH, kW), - stride=1, padding=1, dilation=1, - deformable_groups=deformable_groups).cuda() + conv_offset = nn.Conv2d( + inC, + deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True, + ).cuda() + + conv_mask = nn.Conv2d( + inC, + deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True, + ).cuda() + + dcn_v2 = DCNv2( + inC, + outC, + (kH, kW), + stride=1, + padding=1, + dilation=1, + deformable_groups=deformable_groups, + ).cuda() conv_offset.weight.data.zero_() conv_offset.bias.data.zero_() @@ -60,12 +72,13 @@ def check_zero_offset(): output *= 2 d = (input - output).abs().max() if d < 1e-10: - print('Zero offset passed') + print("Zero offset passed") else: - print('Zero offset failed') + print("Zero offset failed") print(input) print(output) + def check_gradient_dconv(): input = torch.rand(N, inC, inH, inW).cuda() * 0.01 @@ -91,43 +104,58 @@ def check_gradient_dconv(): padding = 1 dilation = 1 - print('check_gradient_dconv: ', - gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, - stride, padding, dilation, deformable_groups), - eps=1e-3, atol=1e-4, rtol=1e-2)) + print( + "check_gradient_dconv: ", + gradcheck( + dcn_v2_conv, + ( + input, + offset, + mask, + weight, + bias, + stride, + padding, + dilation, + deformable_groups, + ), + eps=1e-3, + atol=1e-4, + rtol=1e-2, + ), + ) def check_pooling_zero_offset(): input = torch.randn(2, 16, 64, 64).cuda().zero_() - input[0, :, 16:26, 16:26] = 1. - input[1, :, 10:20, 20:30] = 2. - rois = torch.tensor([ - [0, 65, 65, 103, 103], - [1, 81, 41, 119, 79], - ]).cuda().float() - pooling = DCNv2Pooling(spatial_scale=1.0 / 4, - pooled_size=7, - output_dim=16, - no_trans=True, - group_size=1, - trans_std=0.0).cuda() + input[0, :, 16:26, 16:26] = 1.0 + input[1, :, 10:20, 20:30] = 2.0 + rois = torch.tensor([[0, 65, 65, 103, 103], [1, 81, 41, 119, 79],]).cuda().float() + pooling = DCNv2Pooling( + spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0, + ).cuda() out = pooling(input, rois, input.new()) - s = ', '.join(['%f' % out[i, :, :, :].mean().item() - for i in range(rois.shape[0])]) + s = ", ".join(["%f" % out[i, :, :, :].mean().item() for i in range(rois.shape[0])]) print(s) - dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, - pooled_size=7, - output_dim=16, - no_trans=False, - group_size=1, - trans_std=0.0).cuda() + dpooling = DCNv2Pooling( + spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0, + ).cuda() offset = torch.randn(20, 2, 7, 7).cuda().zero_() dout = dpooling(input, rois, offset) - s = ', '.join(['%f' % dout[i, :, :, :].mean().item() - for i in range(rois.shape[0])]) + s = ", ".join(["%f" % dout[i, :, :, :].mean().item() for i in range(rois.shape[0])]) print(s) @@ -153,24 +181,34 @@ def check_gradient_dpooling(): sample_per_part = 4 part_size = pooled_size - print('check_gradient_dpooling:', - gradcheck(dcn_v2_pooling, (input, rois, offset, - spatial_scale, - pooled_size, - output_dim, - no_trans, - group_size, - part_size, - sample_per_part, - trans_std), - eps=1e-4)) + print( + "check_gradient_dpooling:", + gradcheck( + dcn_v2_pooling, + ( + input, + rois, + offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std, + ), + eps=1e-4, + ), + ) def example_dconv(): input = torch.randn(2, 64, 128, 128).cuda() # wrap all things (offset and mask) in DCN - dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, - padding=1, deformable_groups=2).cuda() + dcn = DCN( + 64, 64, kernel_size=(3, 3), stride=1, padding=1, deformable_groups=2 + ).cuda() # print(dcn.weight.shape, input.shape) output = dcn(input) targert = output.new(*output.size()) @@ -193,20 +231,24 @@ def example_dpooling(): offset.requires_grad = True # normal roi_align - pooling = DCNv2Pooling(spatial_scale=1.0 / 4, - pooled_size=7, - output_dim=32, - no_trans=True, - group_size=1, - trans_std=0.1).cuda() + pooling = DCNv2Pooling( + spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1, + ).cuda() # deformable pooling - dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, - pooled_size=7, - output_dim=32, - no_trans=False, - group_size=1, - trans_std=0.1).cuda() + dpooling = DCNv2Pooling( + spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + ).cuda() out = pooling(input, rois, offset) dout = dpooling(input, rois, offset) @@ -234,13 +276,15 @@ def example_mdpooling(): rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) # mdformable pooling (V2) - dpooling = DCNPooling(spatial_scale=1.0 / 4, - pooled_size=7, - output_dim=32, - no_trans=False, - group_size=1, - trans_std=0.1, - deform_fc_dim=1024).cuda() + dpooling = DCNPooling( + spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024, + ).cuda() dout = dpooling(input, rois) target = dout.new(*dout.size()) @@ -250,7 +294,7 @@ def example_mdpooling(): print(dout.shape) -if __name__ == '__main__': +if __name__ == "__main__": example_dconv() example_dpooling() diff --git a/layers/box_utils.py b/layers/box_utils.py index 543f7a1d8..0f8e6191f 100644 --- a/layers/box_utils.py +++ b/layers/box_utils.py @@ -4,6 +4,7 @@ from data import cfg + @torch.jit.script def point_form(boxes): """ Convert prior_boxes to (xmin, ymin, xmax, ymax) @@ -13,8 +14,13 @@ def point_form(boxes): Return: boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. """ - return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin - boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax + return torch.cat( + ( + boxes[:, :2] - boxes[:, 2:] / 2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:] / 2, + ), + 1, + ) # xmax, ymax @torch.jit.script @@ -26,8 +32,10 @@ def center_size(boxes): Return: boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. """ - return torch.cat(( (boxes[:, 2:] + boxes[:, :2])/2, # cx, cy - boxes[:, 2:] - boxes[:, :2] ), 1) # w, h + return torch.cat( + ((boxes[:, 2:] + boxes[:, :2]) / 2, boxes[:, 2:] - boxes[:, :2]), 1 # cx, cy + ) # w, h + @torch.jit.script def intersect(box_a, box_b): @@ -44,15 +52,19 @@ def intersect(box_a, box_b): n = box_a.size(0) A = box_a.size(1) B = box_b.size(1) - max_xy = torch.min(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), - box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2)) - min_xy = torch.max(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), - box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2)) + max_xy = torch.min( + box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), + box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2), + ) + min_xy = torch.max( + box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), + box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2), + ) inter = torch.clamp((max_xy - min_xy), min=0) return inter[:, :, :, 0] * inter[:, :, :, 1] -def jaccard(box_a, box_b, iscrowd:bool=False): +def jaccard(box_a, box_b, iscrowd: bool = False): """Compute the jaccard overlap of two sets of boxes. The jaccard overlap is simply the intersection over union of two boxes. Here we operate on ground truth boxes and default boxes. If iscrowd=True, put the crowd in box_b. @@ -71,15 +83,22 @@ def jaccard(box_a, box_b, iscrowd:bool=False): box_b = box_b[None, ...] inter = intersect(box_a, box_b) - area_a = ((box_a[:, :, 2]-box_a[:, :, 0]) * - (box_a[:, :, 3]-box_a[:, :, 1])).unsqueeze(2).expand_as(inter) # [A,B] - area_b = ((box_b[:, :, 2]-box_b[:, :, 0]) * - (box_b[:, :, 3]-box_b[:, :, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_a = ( + ((box_a[:, :, 2] - box_a[:, :, 0]) * (box_a[:, :, 3] - box_a[:, :, 1])) + .unsqueeze(2) + .expand_as(inter) + ) # [A,B] + area_b = ( + ((box_b[:, :, 2] - box_b[:, :, 0]) * (box_b[:, :, 3] - box_b[:, :, 1])) + .unsqueeze(1) + .expand_as(inter) + ) # [A,B] union = area_a + area_b - inter out = inter / area_a if iscrowd else inter / union return out if use_batch else out.squeeze(0) + def elemwise_box_iou(box_a, box_b): """ Does the same as above but instead of pairwise, elementwise along the inner dimension. """ max_xy = torch.min(box_a[:, 2:], box_b[:, 2:]) @@ -96,6 +115,7 @@ def elemwise_box_iou(box_a, box_b): # Return value is [n] for inputs [n, 4] return torch.clamp(inter / union, max=1) + def mask_iou(masks_a, masks_b, iscrowd=False): """ Computes the pariwise mask IoU between two sets of masks of size [a, h, w] and [b, h, w]. @@ -111,7 +131,12 @@ def mask_iou(masks_a, masks_b, iscrowd=False): area_a = masks_a.sum(dim=1).unsqueeze(1) area_b = masks_b.sum(dim=1).unsqueeze(0) - return intersection / (area_a + area_b - intersection) if not iscrowd else intersection / area_a + return ( + intersection / (area_a + area_b - intersection) + if not iscrowd + else intersection / area_a + ) + def elemwise_mask_iou(masks_a, masks_b): """ Does the same as above but instead of pairwise, elementwise along the outer dimension. """ @@ -123,8 +148,9 @@ def elemwise_mask_iou(masks_a, masks_b): area_b = masks_b.sum(dim=0) # Return value is [n] for inputs [h, w, n] - return torch.clamp(intersection / torch.clamp(area_a + area_b - intersection, min=0.1), max=1) - + return torch.clamp( + intersection / torch.clamp(area_a + area_b - intersection, min=0.1), max=1 + ) def change(gt, priors): @@ -138,12 +164,12 @@ def change(gt, priors): Note this returns -change so it can be a drop in replacement for """ num_priors = priors.size(0) - num_gt = gt.size(0) + num_gt = gt.size(0) gt_w = (gt[:, 2] - gt[:, 0])[:, None].expand(num_gt, num_priors) gt_h = (gt[:, 3] - gt[:, 1])[:, None].expand(num_gt, num_priors) - gt_mat = gt[:, None, :].expand(num_gt, num_priors, 4) + gt_mat = gt[:, None, :].expand(num_gt, num_priors, 4) pr_mat = priors[None, :, :].expand(num_gt, num_priors, 4) diff = gt_mat - pr_mat @@ -152,12 +178,22 @@ def change(gt, priors): diff[:, :, 1] /= gt_h diff[:, :, 3] /= gt_h - return -torch.sqrt( (diff ** 2).sum(dim=2) ) - - - - -def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, conf_t, idx_t, idx, loc_data): + return -torch.sqrt((diff ** 2).sum(dim=2)) + + +def match( + pos_thresh, + neg_thresh, + truths, + priors, + labels, + crowd_boxes, + loc_t, + conf_t, + idx_t, + idx, + loc_data, +): """Match each prior box with the ground truth box of the highest jaccard overlap, encode the bounding boxes, then return the matched indices corresponding to both confidence and location preds. @@ -176,10 +212,18 @@ def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, co Return: The matched indices corresponding to 1)location and 2)confidence preds. """ - decoded_priors = decode(loc_data, priors, cfg.use_yolo_regressors) if cfg.use_prediction_matching else point_form(priors) - + decoded_priors = ( + decode(loc_data, priors, cfg.use_yolo_regressors) + if cfg.use_prediction_matching + else point_form(priors) + ) + # Size [num_objects, num_priors] - overlaps = jaccard(truths, decoded_priors) if not cfg.use_change_matching else change(truths, decoded_priors) + overlaps = ( + jaccard(truths, decoded_priors) + if not cfg.use_change_matching + else change(truths, decoded_priors) + ) # Size [num_priors] best ground truth for each prior best_truth_overlap, best_truth_idx = overlaps.max(0) @@ -207,11 +251,11 @@ def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, co # Set the gt to be used for i to be j, overwriting whatever was there best_truth_idx[i] = j - matches = truths[best_truth_idx] # Shape: [num_priors,4] - conf = labels[best_truth_idx] + 1 # Shape: [num_priors] + matches = truths[best_truth_idx] # Shape: [num_priors,4] + conf = labels[best_truth_idx] + 1 # Shape: [num_priors] conf[best_truth_overlap < pos_thresh] = -1 # label as neutral - conf[best_truth_overlap < neg_thresh] = 0 # label as background + conf[best_truth_overlap < neg_thresh] = 0 # label as background # Deal with crowd annotations for COCO if crowd_boxes is not None and cfg.crowd_iou_threshold < 1: @@ -223,12 +267,13 @@ def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, co conf[(conf <= 0) & (best_crowd_overlap > cfg.crowd_iou_threshold)] = -1 loc = encode(matches, priors, cfg.use_yolo_regressors) - loc_t[idx] = loc # [num_priors,4] encoded offsets to learn - conf_t[idx] = conf # [num_priors] top class label for each prior - idx_t[idx] = best_truth_idx # [num_priors] indices for lookup + loc_t[idx] = loc # [num_priors,4] encoded offsets to learn + conf_t[idx] = conf # [num_priors] top class label for each prior + idx_t[idx] = best_truth_idx # [num_priors] indices for lookup + @torch.jit.script -def encode(matched, priors, use_yolo_regressors:bool=False): +def encode(matched, priors, use_yolo_regressors: bool = False): """ Encode bboxes matched with each prior into the format produced by the network. See decode for more details on @@ -246,27 +291,27 @@ def encode(matched, priors, use_yolo_regressors:bool=False): # In fact encode(decode(x, p), p) should be x boxes = center_size(matched) - loc = torch.cat(( - boxes[:, :2] - priors[:, :2], - torch.log(boxes[:, 2:] / priors[:, 2:]) - ), 1) + loc = torch.cat( + (boxes[:, :2] - priors[:, :2], torch.log(boxes[:, 2:] / priors[:, 2:])), 1 + ) else: variances = [0.1, 0.2] # dist b/t match center and prior's center - g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] + g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2] # encode variance - g_cxcy /= (variances[0] * priors[:, 2:]) + g_cxcy /= variances[0] * priors[:, 2:] # match wh / prior wh g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] g_wh = torch.log(g_wh) / variances[1] # return target for smooth_l1_loss loc = torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] - + return loc + @torch.jit.script -def decode(loc, priors, use_yolo_regressors:bool=False): +def decode(loc, priors, use_yolo_regressors: bool = False): """ Decode predicted bbox coordinates using the same scheme employed by Yolov2: https://arxiv.org/pdf/1612.08242.pdf @@ -295,23 +340,25 @@ def decode(loc, priors, use_yolo_regressors:bool=False): if use_yolo_regressors: # Decoded boxes in center-size notation - boxes = torch.cat(( - loc[:, :2] + priors[:, :2], - priors[:, 2:] * torch.exp(loc[:, 2:]) - ), 1) + boxes = torch.cat( + (loc[:, :2] + priors[:, :2], priors[:, 2:] * torch.exp(loc[:, 2:])), 1 + ) boxes = point_form(boxes) else: variances = [0.1, 0.2] - - boxes = torch.cat(( - priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], - priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + + boxes = torch.cat( + ( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1]), + ), + 1, + ) boxes[:, :2] -= boxes[:, 2:] / 2 boxes[:, 2:] += boxes[:, :2] - - return boxes + return boxes def log_sum_exp(x): @@ -322,11 +369,11 @@ def log_sum_exp(x): x (Variable(tensor)): conf_preds from conf layers """ x_max = x.data.max() - return torch.log(torch.sum(torch.exp(x-x_max), 1)) + x_max + return torch.log(torch.sum(torch.exp(x - x_max), 1)) + x_max @torch.jit.script -def sanitize_coordinates(_x1, _x2, img_size:int, padding:int=0, cast:bool=True): +def sanitize_coordinates(_x1, _x2, img_size: int, padding: int = 0, cast: bool = True): """ Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0, and x2 <= image_size. Also converts from relative to absolute coordinates and casts the results to long tensors. @@ -341,14 +388,14 @@ def sanitize_coordinates(_x1, _x2, img_size:int, padding:int=0, cast:bool=True): _x2 = _x2.long() x1 = torch.min(_x1, _x2) x2 = torch.max(_x1, _x2) - x1 = torch.clamp(x1-padding, min=0) - x2 = torch.clamp(x2+padding, max=img_size) + x1 = torch.clamp(x1 - padding, min=0) + x2 = torch.clamp(x2 + padding, max=img_size) return x1, x2 @torch.jit.script -def crop(masks, boxes, padding:int=1): +def crop(masks, boxes, padding: int = 1): """ "Crop" predicted masks by zeroing out everything not in the predicted bbox. Vectorized by Chong (thanks Chong). @@ -361,16 +408,24 @@ def crop(masks, boxes, padding:int=1): x1, x2 = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, padding, cast=False) y1, y2 = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, padding, cast=False) - rows = torch.arange(w, device=masks.device, dtype=x1.dtype).view(1, -1, 1).expand(h, w, n) - cols = torch.arange(h, device=masks.device, dtype=x1.dtype).view(-1, 1, 1).expand(h, w, n) - - masks_left = rows >= x1.view(1, 1, -1) - masks_right = rows < x2.view(1, 1, -1) - masks_up = cols >= y1.view(1, 1, -1) - masks_down = cols < y2.view(1, 1, -1) - + rows = ( + torch.arange(w, device=masks.device, dtype=x1.dtype) + .view(1, -1, 1) + .expand(h, w, n) + ) + cols = ( + torch.arange(h, device=masks.device, dtype=x1.dtype) + .view(-1, 1, 1) + .expand(h, w, n) + ) + + masks_left = rows >= x1.view(1, 1, -1) + masks_right = rows < x2.view(1, 1, -1) + masks_up = cols >= y1.view(1, 1, -1) + masks_down = cols < y2.view(1, 1, -1) + crop_mask = masks_left * masks_right * masks_up * masks_down - + return masks * crop_mask.float() @@ -385,6 +440,6 @@ def index2d(src, idx): """ offs = torch.arange(idx.size(0), device=idx.device)[:, None].expand_as(idx) - idx = idx + offs * idx.size(1) + idx = idx + offs * idx.size(1) return src.view(-1)[idx.view(-1)].view(idx.size()) diff --git a/layers/functions/__init__.py b/layers/functions/__init__.py index 56ef07f46..643097bce 100644 --- a/layers/functions/__init__.py +++ b/layers/functions/__init__.py @@ -1,4 +1,4 @@ from .detection import Detect -__all__ = ['Detect'] +__all__ = ["Detect"] diff --git a/layers/functions/detection.py b/layers/functions/detection.py index 4e5fd068c..9ef723085 100644 --- a/layers/functions/detection.py +++ b/layers/functions/detection.py @@ -14,6 +14,7 @@ class Detect(object): scores and threshold to a top_k number of output predictions for both confidence score and locations, as the predicted masks. """ + # TODO: Refactor this whole class away. It needs to go. def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): @@ -23,9 +24,9 @@ def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): # Parameters used in nms. self.nms_thresh = nms_thresh if nms_thresh <= 0: - raise ValueError('nms_threshold must be non negative.') + raise ValueError("nms_threshold must be non negative.") self.conf_thresh = conf_thresh - + self.use_cross_class_nms = False self.use_fast_nms = False @@ -50,66 +51,78 @@ def __call__(self, predictions, net): Note that the outputs are sorted only if cross_class_nms is False """ - loc_data = predictions['loc'] - conf_data = predictions['conf'] - mask_data = predictions['mask'] - prior_data = predictions['priors'] + loc_data = predictions["loc"] + conf_data = predictions["conf"] + mask_data = predictions["mask"] + prior_data = predictions["priors"] - proto_data = predictions['proto'] if 'proto' in predictions else None - inst_data = predictions['inst'] if 'inst' in predictions else None + proto_data = predictions["proto"] if "proto" in predictions else None + inst_data = predictions["inst"] if "inst" in predictions else None out = [] - with timer.env('Detect'): + with timer.env("Detect"): batch_size = loc_data.size(0) num_priors = prior_data.size(0) - conf_preds = conf_data.view(batch_size, num_priors, self.num_classes).transpose(2, 1).contiguous() + conf_preds = ( + conf_data.view(batch_size, num_priors, self.num_classes) + .transpose(2, 1) + .contiguous() + ) for batch_idx in range(batch_size): decoded_boxes = decode(loc_data[batch_idx], prior_data) - result = self.detect(batch_idx, conf_preds, decoded_boxes, mask_data, inst_data) + result = self.detect( + batch_idx, conf_preds, decoded_boxes, mask_data, inst_data + ) if result is not None and proto_data is not None: - result['proto'] = proto_data[batch_idx] + result["proto"] = proto_data[batch_idx] - out.append({'detection': result, 'net': net}) - - return out + out.append({"detection": result, "net": net}) + return out def detect(self, batch_idx, conf_preds, decoded_boxes, mask_data, inst_data): """ Perform nms for only the max scoring class that isn't background (class 0) """ cur_scores = conf_preds[batch_idx, 1:, :] conf_scores, _ = torch.max(cur_scores, dim=0) - keep = (conf_scores > self.conf_thresh) + keep = conf_scores > self.conf_thresh scores = cur_scores[:, keep] boxes = decoded_boxes[keep, :] masks = mask_data[batch_idx, keep, :] if inst_data is not None: inst = inst_data[batch_idx, keep, :] - + if scores.size(1) == 0: return None - + if self.use_fast_nms: if self.use_cross_class_nms: - boxes, masks, classes, scores = self.cc_fast_nms(boxes, masks, scores, self.nms_thresh, self.top_k) + boxes, masks, classes, scores = self.cc_fast_nms( + boxes, masks, scores, self.nms_thresh, self.top_k + ) else: - boxes, masks, classes, scores = self.fast_nms(boxes, masks, scores, self.nms_thresh, self.top_k) + boxes, masks, classes, scores = self.fast_nms( + boxes, masks, scores, self.nms_thresh, self.top_k + ) else: - boxes, masks, classes, scores = self.traditional_nms(boxes, masks, scores, self.nms_thresh, self.conf_thresh) + boxes, masks, classes, scores = self.traditional_nms( + boxes, masks, scores, self.nms_thresh, self.conf_thresh + ) if self.use_cross_class_nms: - print('Warning: Cross Class Traditional NMS is not implemented.') + print("Warning: Cross Class Traditional NMS is not implemented.") - return {'box': boxes, 'mask': masks, 'class': classes, 'score': scores} + return {"box": boxes, "mask": masks, "class": classes, "score": scores} - - def cc_fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200): - # Collapse all the classes into 1 + def cc_fast_nms( + self, boxes, masks, scores, iou_threshold: float = 0.5, top_k: int = 200 + ): + # Collapse all the classes into 1 scores, classes = scores.max(dim=0) _, idx = scores.sort(0, descending=True) @@ -119,7 +132,7 @@ def cc_fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=2 # Compute the pairwise IoU between the boxes iou = jaccard(boxes_idx, boxes_idx) - + # Zero out the lower triangle of the cosine similarity matrix and diagonal iou.triu_(diagonal=1) @@ -131,15 +144,23 @@ def cc_fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=2 # Now just filter out the ones greater than the threshold, i.e., only keep boxes that # don't have a higher scoring box that would supress it in normal NMS. idx_out = idx[iou_max <= iou_threshold] - + return boxes[idx_out], masks[idx_out], classes[idx_out], scores[idx_out] - def fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False): + def fast_nms( + self, + boxes, + masks, + scores, + iou_threshold: float = 0.5, + top_k: int = 200, + second_threshold: bool = False, + ): scores, idx = scores.sort(1, descending=True) idx = idx[:, :top_k].contiguous() scores = scores[:, :top_k] - + num_classes, num_dets = idx.size() boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4) @@ -150,7 +171,7 @@ def fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, iou_max, _ = iou.max(dim=1) # Now just filter out the ones higher than the threshold - keep = (iou_max <= iou_threshold) + keep = iou_max <= iou_threshold # We should also only keep detections over the confidence threshold, but at the cost of # maxing out your detection count for every image, you can just not do that. Because we @@ -158,20 +179,22 @@ def fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, # this increase doesn't affect us much (+0.2 mAP for 34 -> 33 fps), so we leave it out. # However, when you implement this in your method, you should do this second threshold. if second_threshold: - keep *= (scores > self.conf_thresh) + keep *= scores > self.conf_thresh # Assign each kept detection to its corresponding class - classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep) + classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as( + keep + ) classes = classes[keep] boxes = boxes[keep] masks = masks[keep] scores = scores[keep] - + # Only keep the top cfg.max_num_detections highest scores across all classes scores, idx = scores.sort(0, descending=True) - idx = idx[:cfg.max_num_detections] - scores = scores[:cfg.max_num_detections] + idx = idx[: cfg.max_num_detections] + scores = scores[: cfg.max_num_detections] classes = classes[idx] boxes = boxes[idx] @@ -179,9 +202,14 @@ def fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, return boxes, masks, classes, scores - def traditional_nms(self, boxes, masks, scores, iou_threshold=0.5, conf_thresh=0.05): + def traditional_nms( + self, boxes, masks, scores, iou_threshold=0.5, conf_thresh=0.05 + ): import pyximport - pyximport.install(setup_args={"include_dirs":np.get_include()}, reload_support=True) + + pyximport.install( + setup_args={"include_dirs": np.get_include()}, reload_support=True + ) from utils.cython_nms import nms as cnms @@ -204,22 +232,24 @@ def traditional_nms(self, boxes, masks, scores, iou_threshold=0.5, conf_thresh=0 if cls_scores.size(0) == 0: continue - - preds = torch.cat([boxes[conf_mask], cls_scores[:, None]], dim=1).cpu().numpy() + + preds = ( + torch.cat([boxes[conf_mask], cls_scores[:, None]], dim=1).cpu().numpy() + ) keep = cnms(preds, iou_threshold) keep = torch.Tensor(keep, device=boxes.device).long() idx_lst.append(idx[keep]) cls_lst.append(keep * 0 + _cls) scr_lst.append(cls_scores[keep]) - - idx = torch.cat(idx_lst, dim=0) + + idx = torch.cat(idx_lst, dim=0) classes = torch.cat(cls_lst, dim=0) - scores = torch.cat(scr_lst, dim=0) + scores = torch.cat(scr_lst, dim=0) scores, idx2 = scores.sort(0, descending=True) - idx2 = idx2[:cfg.max_num_detections] - scores = scores[:cfg.max_num_detections] + idx2 = idx2[: cfg.max_num_detections] + scores = scores[: cfg.max_num_detections] idx = idx[idx2] classes = classes[idx2] diff --git a/layers/interpolate.py b/layers/interpolate.py index 71419a80c..33767b2fb 100644 --- a/layers/interpolate.py +++ b/layers/interpolate.py @@ -1,17 +1,18 @@ import torch.nn as nn import torch.nn.functional as F + class InterpolateModule(nn.Module): - """ + """ This is a module version of F.interpolate (rip nn.Upsampling). Any arguments you give it just get passed along for the ride. """ - def __init__(self, *args, **kwdargs): - super().__init__() + def __init__(self, *args, **kwdargs): + super().__init__() - self.args = args - self.kwdargs = kwdargs + self.args = args + self.kwdargs = kwdargs - def forward(self, x): - return F.interpolate(x, *self.args, **self.kwdargs) + def forward(self, x): + return F.interpolate(x, *self.args, **self.kwdargs) diff --git a/layers/modules/__init__.py b/layers/modules/__init__.py index cf24bddbf..70c7e95bd 100644 --- a/layers/modules/__init__.py +++ b/layers/modules/__init__.py @@ -1,3 +1,3 @@ from .multibox_loss import MultiBoxLoss -__all__ = ['MultiBoxLoss'] +__all__ = ["MultiBoxLoss"] diff --git a/layers/modules/multibox_loss.py b/layers/modules/multibox_loss.py index ddf3904ce..e7e8ea88b 100644 --- a/layers/modules/multibox_loss.py +++ b/layers/modules/multibox_loss.py @@ -3,10 +3,19 @@ import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable -from ..box_utils import match, log_sum_exp, decode, center_size, crop, elemwise_mask_iou, elemwise_box_iou +from ..box_utils import ( + match, + log_sum_exp, + decode, + center_size, + crop, + elemwise_mask_iou, + elemwise_box_iou, +) from data import cfg, mask_type, activation_func + class MultiBoxLoss(nn.Module): """SSD Weighted Loss Function Compute Targets: @@ -33,14 +42,14 @@ class MultiBoxLoss(nn.Module): def __init__(self, num_classes, pos_threshold, neg_threshold, negpos_ratio): super(MultiBoxLoss, self).__init__() self.num_classes = num_classes - + self.pos_threshold = pos_threshold self.neg_threshold = neg_threshold self.negpos_ratio = negpos_ratio # If you output a proto mask with this area, your l1 loss will be l1_alpha # Note that the area is relative (so 1 would be the entire image) - self.l1_expected_area = 20*20/70/70 + self.l1_expected_area = 20 * 20 / 70 / 70 self.l1_alpha = 0.1 if cfg.use_class_balanced_conf: @@ -70,18 +79,18 @@ def forward(self, net, predictions, targets, masks, num_crowds): * Only if mask_type == lincomb """ - loc_data = predictions['loc'] - conf_data = predictions['conf'] - mask_data = predictions['mask'] - priors = predictions['priors'] + loc_data = predictions["loc"] + conf_data = predictions["conf"] + mask_data = predictions["mask"] + priors = predictions["priors"] if cfg.mask_type == mask_type.lincomb: - proto_data = predictions['proto'] + proto_data = predictions["proto"] + + score_data = predictions["score"] if cfg.use_mask_scoring else None + inst_data = predictions["inst"] if cfg.use_instance_coeff else None - score_data = predictions['score'] if cfg.use_mask_scoring else None - inst_data = predictions['inst'] if cfg.use_instance_coeff else None - - labels = [None] * len(targets) # Used in sem segm loss + labels = [None] * len(targets) # Used in sem segm loss batch_size = loc_data.size(0) num_priors = priors.size(0) @@ -95,16 +104,18 @@ def forward(self, net, predictions, targets, masks, num_crowds): idx_t = loc_data.new(batch_size, num_priors).long() if cfg.use_class_existence_loss: - class_existence_t = loc_data.new(batch_size, num_classes-1) + class_existence_t = loc_data.new(batch_size, num_classes - 1) for idx in range(batch_size): - truths = targets[idx][:, :-1].data + truths = targets[idx][:, :-1].data labels[idx] = targets[idx][:, -1].data.long() if cfg.use_class_existence_loss: # Construct a one-hot vector for each object and collapse it into an existence vector with max # Also it's fine to include the crowd annotations here - class_existence_t[idx, :] = torch.eye(num_classes-1, device=conf_t.get_device())[labels[idx]].max(dim=0)[0] + class_existence_t[idx, :] = torch.eye( + num_classes - 1, device=conf_t.get_device() + )[labels[idx]].max(dim=0)[0] # Split the crowd annotations because they come bundled in cur_crowds = num_crowds[idx] @@ -114,15 +125,24 @@ def forward(self, net, predictions, targets, masks, num_crowds): # We don't use the crowd labels or masks _, labels[idx] = split(labels[idx]) - _, masks[idx] = split(masks[idx]) + _, masks[idx] = split(masks[idx]) else: crowd_boxes = None - - match(self.pos_threshold, self.neg_threshold, - truths, priors.data, labels[idx], crowd_boxes, - loc_t, conf_t, idx_t, idx, loc_data[idx]) - + match( + self.pos_threshold, + self.neg_threshold, + truths, + priors.data, + labels[idx], + crowd_boxes, + loc_t, + conf_t, + idx_t, + idx, + loc_data[idx], + ) + gt_box_t[idx, :, :] = truths[idx_t[idx]] # wrap targets @@ -132,17 +152,19 @@ def forward(self, net, predictions, targets, masks, num_crowds): pos = conf_t > 0 num_pos = pos.sum(dim=1, keepdim=True) - + # Shape: [batch,num_priors,4] pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) - + losses = {} # Localization Loss (Smooth L1) if cfg.train_boxes: loc_p = loc_data[pos_idx].view(-1, 4) loc_t = loc_t[pos_idx].view(-1, 4) - losses['B'] = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') * cfg.bbox_alpha + losses["B"] = ( + F.smooth_l1_loss(loc_p, loc_t, reduction="sum") * cfg.bbox_alpha + ) if cfg.train_masks: if cfg.mask_type == mask_type.direct: @@ -152,11 +174,30 @@ def forward(self, net, predictions, targets, masks, num_crowds): pos_masks.append(masks[idx][idx_t[idx, pos[idx]]]) masks_t = torch.cat(pos_masks, 0) masks_p = mask_data[pos, :].view(-1, cfg.mask_dim) - losses['M'] = F.binary_cross_entropy(torch.clamp(masks_p, 0, 1), masks_t, reduction='sum') * cfg.mask_alpha + losses["M"] = ( + F.binary_cross_entropy( + torch.clamp(masks_p, 0, 1), masks_t, reduction="sum" + ) + * cfg.mask_alpha + ) else: - losses['M'] = self.direct_mask_loss(pos_idx, idx_t, loc_data, mask_data, priors, masks) + losses["M"] = self.direct_mask_loss( + pos_idx, idx_t, loc_data, mask_data, priors, masks + ) elif cfg.mask_type == mask_type.lincomb: - ret = self.lincomb_mask_loss(pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels) + ret = self.lincomb_mask_loss( + pos, + idx_t, + loc_data, + mask_data, + priors, + proto_data, + masks, + gt_box_t, + score_data, + inst_data, + labels, + ) if cfg.use_maskiou: loss, maskiou_targets = ret else: @@ -164,40 +205,52 @@ def forward(self, net, predictions, targets, masks, num_crowds): losses.update(loss) if cfg.mask_proto_loss is not None: - if cfg.mask_proto_loss == 'l1': - losses['P'] = torch.mean(torch.abs(proto_data)) / self.l1_expected_area * self.l1_alpha - elif cfg.mask_proto_loss == 'disj': - losses['P'] = -torch.mean(torch.max(F.log_softmax(proto_data, dim=-1), dim=-1)[0]) + if cfg.mask_proto_loss == "l1": + losses["P"] = ( + torch.mean(torch.abs(proto_data)) + / self.l1_expected_area + * self.l1_alpha + ) + elif cfg.mask_proto_loss == "disj": + losses["P"] = -torch.mean( + torch.max(F.log_softmax(proto_data, dim=-1), dim=-1)[0] + ) # Confidence loss if cfg.use_focal_loss: if cfg.use_sigmoid_focal_loss: - losses['C'] = self.focal_conf_sigmoid_loss(conf_data, conf_t) + losses["C"] = self.focal_conf_sigmoid_loss(conf_data, conf_t) elif cfg.use_objectness_score: - losses['C'] = self.focal_conf_objectness_loss(conf_data, conf_t) + losses["C"] = self.focal_conf_objectness_loss(conf_data, conf_t) else: - losses['C'] = self.focal_conf_loss(conf_data, conf_t) + losses["C"] = self.focal_conf_loss(conf_data, conf_t) else: if cfg.use_objectness_score: - losses['C'] = self.conf_objectness_loss(conf_data, conf_t, batch_size, loc_p, loc_t, priors) + losses["C"] = self.conf_objectness_loss( + conf_data, conf_t, batch_size, loc_p, loc_t, priors + ) else: - losses['C'] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size) + losses["C"] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size) # Mask IoU Loss if cfg.use_maskiou and maskiou_targets is not None: - losses['I'] = self.mask_iou_loss(net, maskiou_targets) + losses["I"] = self.mask_iou_loss(net, maskiou_targets) # These losses also don't depend on anchors if cfg.use_class_existence_loss: - losses['E'] = self.class_existence_loss(predictions['classes'], class_existence_t) + losses["E"] = self.class_existence_loss( + predictions["classes"], class_existence_t + ) if cfg.use_semantic_segmentation_loss: - losses['S'] = self.semantic_segmentation_loss(predictions['segm'], masks, labels) + losses["S"] = self.semantic_segmentation_loss( + predictions["segm"], masks, labels + ) # Divide all losses by the number of positives. # Don't do it for loss[P] because that doesn't depend on the anchors. total_num_pos = num_pos.data.sum().float() for k in losses: - if k not in ('P', 'E', 'S'): + if k not in ("P", "E", "S"): losses[k] /= total_num_pos else: losses[k] /= batch_size @@ -213,37 +266,48 @@ def forward(self, net, predictions, targets, masks, num_crowds): return losses def class_existence_loss(self, class_data, class_existence_t): - return cfg.class_existence_alpha * F.binary_cross_entropy_with_logits(class_data, class_existence_t, reduction='sum') + return cfg.class_existence_alpha * F.binary_cross_entropy_with_logits( + class_data, class_existence_t, reduction="sum" + ) - def semantic_segmentation_loss(self, segment_data, mask_t, class_t, interpolation_mode='bilinear'): + def semantic_segmentation_loss( + self, segment_data, mask_t, class_t, interpolation_mode="bilinear" + ): # Note num_classes here is without the background class so cfg.num_classes-1 batch_size, num_classes, mask_h, mask_w = segment_data.size() loss_s = 0 - + for idx in range(batch_size): cur_segment = segment_data[idx] cur_class_t = class_t[idx] with torch.no_grad(): - downsampled_masks = F.interpolate(mask_t[idx].unsqueeze(0), (mask_h, mask_w), - mode=interpolation_mode, align_corners=False).squeeze(0) + downsampled_masks = F.interpolate( + mask_t[idx].unsqueeze(0), + (mask_h, mask_w), + mode=interpolation_mode, + align_corners=False, + ).squeeze(0) downsampled_masks = downsampled_masks.gt(0.5).float() - + # Construct Semantic Segmentation segment_t = torch.zeros_like(cur_segment, requires_grad=False) for obj_idx in range(downsampled_masks.size(0)): - segment_t[cur_class_t[obj_idx]] = torch.max(segment_t[cur_class_t[obj_idx]], downsampled_masks[obj_idx]) - - loss_s += F.binary_cross_entropy_with_logits(cur_segment, segment_t, reduction='sum') - - return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha + segment_t[cur_class_t[obj_idx]] = torch.max( + segment_t[cur_class_t[obj_idx]], downsampled_masks[obj_idx] + ) + loss_s += F.binary_cross_entropy_with_logits( + cur_segment, segment_t, reduction="sum" + ) + + return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha def ohem_conf_loss(self, conf_data, conf_t, pos, num): # Compute max conf across batch for hard negative mining batch_conf = conf_data.view(-1, self.num_classes) if cfg.ohem_use_most_confident: - # i.e. max(softmax) along classes > 0 + # i.e. max(softmax) along classes > 0 batch_conf = F.softmax(batch_conf, dim=1) loss_c, _ = batch_conf[:, 1:].max(dim=1) else: @@ -252,39 +316,43 @@ def ohem_conf_loss(self, conf_data, conf_t, pos, num): # Hard Negative Mining loss_c = loss_c.view(num, -1) - loss_c[pos] = 0 # filter out pos boxes - loss_c[conf_t < 0] = 0 # filter out neutrals (conf_t = -1) + loss_c[pos] = 0 # filter out pos boxes + loss_c[conf_t < 0] = 0 # filter out neutrals (conf_t = -1) _, loss_idx = loss_c.sort(1, descending=True) _, idx_rank = loss_idx.sort(1) num_pos = pos.long().sum(1, keepdim=True) - num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) + num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1) neg = idx_rank < num_neg.expand_as(idx_rank) - + # Just in case there aren't enough negatives, don't start using positives as negatives - neg[pos] = 0 - neg[conf_t < 0] = 0 # Filter out neutrals + neg[pos] = 0 + neg[conf_t < 0] = 0 # Filter out neutrals # Confidence Loss Including Positive and Negative Examples pos_idx = pos.unsqueeze(2).expand_as(conf_data) neg_idx = neg.unsqueeze(2).expand_as(conf_data) - conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes) - targets_weighted = conf_t[(pos+neg).gt(0)] - loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='none') + conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes) + targets_weighted = conf_t[(pos + neg).gt(0)] + loss_c = F.cross_entropy(conf_p, targets_weighted, reduction="none") if cfg.use_class_balanced_conf: # Lazy initialization if self.class_instances is None: - self.class_instances = torch.zeros(self.num_classes, device=targets_weighted.device) - + self.class_instances = torch.zeros( + self.num_classes, device=targets_weighted.device + ) + classes, counts = targets_weighted.unique(return_counts=True) - + for _cls, _cnt in zip(classes.cpu().numpy(), counts.cpu().numpy()): self.class_instances[_cls] += _cnt self.total_instances += targets_weighted.size(0) - weighting = 1 - (self.class_instances[targets_weighted] / self.total_instances) - weighting = torch.clamp(weighting, min=1/self.num_classes) + weighting = 1 - ( + self.class_instances[targets_weighted] / self.total_instances + ) + weighting = torch.clamp(weighting, min=1 / self.num_classes) # If you do the math, the average weight of self.class_instances is this avg_weight = (self.num_classes - 1) / self.num_classes @@ -292,7 +360,7 @@ def ohem_conf_loss(self, conf_data, conf_t, pos, num): loss_c = (loss_c * weighting).sum() / avg_weight else: loss_c = loss_c.sum() - + return cfg.conf_alpha * loss_c def focal_conf_loss(self, conf_data, conf_t): @@ -301,30 +369,34 @@ def focal_conf_loss(self, conf_data, conf_t): Adapted from https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py Note that this uses softmax and not the original sigmoid from the paper. """ - conf_t = conf_t.view(-1) # [batch_size*num_priors] - conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes] + conf_t = conf_t.view(-1) # [batch_size*num_priors] + conf_data = conf_data.view( + -1, conf_data.size(-1) + ) # [batch_size*num_priors, num_classes] # Ignore neutral samples (class < 0) keep = (conf_t >= 0).float() - conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss + conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss logpt = F.log_softmax(conf_data, dim=-1) logpt = logpt.gather(1, conf_t.unsqueeze(-1)) logpt = logpt.view(-1) - pt = logpt.exp() + pt = logpt.exp() # I adapted the alpha_t calculation here from # https://github.com/pytorch/pytorch/blob/master/modules/detectron/softmax_focal_loss_op.cu # You'd think you want all the alphas to sum to one, but in the original implementation they # just give background an alpha of 1-alpha and each forground an alpha of alpha. background = (conf_t == 0).float() - at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * (1 - background) + at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * ( + 1 - background + ) loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt # See comment above for keep return cfg.conf_alpha * (loss * keep).sum() - + def focal_conf_sigmoid_loss(self, conf_data, conf_t): """ Focal loss but using sigmoid like the original paper. @@ -333,29 +405,39 @@ def focal_conf_sigmoid_loss(self, conf_data, conf_t): """ num_classes = conf_data.size(-1) - conf_t = conf_t.view(-1) # [batch_size*num_priors] - conf_data = conf_data.view(-1, num_classes) # [batch_size*num_priors, num_classes] + conf_t = conf_t.view(-1) # [batch_size*num_priors] + conf_data = conf_data.view( + -1, num_classes + ) # [batch_size*num_priors, num_classes] # Ignore neutral samples (class < 0) keep = (conf_t >= 0).float() - conf_t[conf_t < 0] = 0 # can't mask with -1, so filter that out + conf_t[conf_t < 0] = 0 # can't mask with -1, so filter that out # Compute a one-hot embedding of conf_t # From https://github.com/kuangliu/pytorch-retinanet/blob/master/utils.py conf_one_t = torch.eye(num_classes, device=conf_t.get_device())[conf_t] - conf_pm_t = conf_one_t * 2 - 1 # -1 if background, +1 if forground for specific class - - logpt = F.logsigmoid(conf_data * conf_pm_t) # note: 1 - sigmoid(x) = sigmoid(-x) - pt = logpt.exp() - - at = cfg.focal_loss_alpha * conf_one_t + (1 - cfg.focal_loss_alpha) * (1 - conf_one_t) - at[..., 0] = 0 # Set alpha for the background class to 0 because sigmoid focal loss doesn't use it + conf_pm_t = ( + conf_one_t * 2 - 1 + ) # -1 if background, +1 if forground for specific class + + logpt = F.logsigmoid( + conf_data * conf_pm_t + ) # note: 1 - sigmoid(x) = sigmoid(-x) + pt = logpt.exp() + + at = cfg.focal_loss_alpha * conf_one_t + (1 - cfg.focal_loss_alpha) * ( + 1 - conf_one_t + ) + at[ + ..., 0 + ] = 0 # Set alpha for the background class to 0 because sigmoid focal loss doesn't use it loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt loss = keep * loss.sum(dim=-1) return cfg.conf_alpha * loss.sum() - + def focal_conf_objectness_loss(self, conf_data, conf_t): """ Instead of using softmax, use class[0] to be the objectness score and do sigmoid focal loss on that. @@ -365,69 +447,83 @@ def focal_conf_objectness_loss(self, conf_data, conf_t): similar during test-time to softmax by setting class[1:] = softmax(class[1:]) * class[0] and invert class[0]. """ - conf_t = conf_t.view(-1) # [batch_size*num_priors] - conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes] + conf_t = conf_t.view(-1) # [batch_size*num_priors] + conf_data = conf_data.view( + -1, conf_data.size(-1) + ) # [batch_size*num_priors, num_classes] # Ignore neutral samples (class < 0) keep = (conf_t >= 0).float() - conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss + conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss background = (conf_t == 0).float() - at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * (1 - background) + at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * ( + 1 - background + ) - logpt = F.logsigmoid(conf_data[:, 0]) * (1 - background) + F.logsigmoid(-conf_data[:, 0]) * background - pt = logpt.exp() + logpt = ( + F.logsigmoid(conf_data[:, 0]) * (1 - background) + + F.logsigmoid(-conf_data[:, 0]) * background + ) + pt = logpt.exp() obj_loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt # All that was the objectiveness loss--now time for the class confidence loss pos_mask = conf_t > 0 - conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes - conf_t_pos = conf_t[pos_mask] - 1 # So subtract 1 here + conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes + conf_t_pos = conf_t[pos_mask] - 1 # So subtract 1 here - class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction='sum') + class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction="sum") return cfg.conf_alpha * (class_loss + (obj_loss * keep).sum()) - + def conf_objectness_loss(self, conf_data, conf_t, batch_size, loc_p, loc_t, priors): """ Instead of using softmax, use class[0] to be p(obj) * p(IoU) as in YOLO. Then for the rest of the classes, softmax them and apply CE for only the positive examples. """ - conf_t = conf_t.view(-1) # [batch_size*num_priors] - conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes] + conf_t = conf_t.view(-1) # [batch_size*num_priors] + conf_data = conf_data.view( + -1, conf_data.size(-1) + ) # [batch_size*num_priors, num_classes] - pos_mask = (conf_t > 0) - neg_mask = (conf_t == 0) + pos_mask = conf_t > 0 + neg_mask = conf_t == 0 obj_data = conf_data[:, 0] obj_data_pos = obj_data[pos_mask] obj_data_neg = obj_data[neg_mask] # Don't be confused, this is just binary cross entropy similified - obj_neg_loss = - F.logsigmoid(-obj_data_neg).sum() + obj_neg_loss = -F.logsigmoid(-obj_data_neg).sum() with torch.no_grad(): - pos_priors = priors.unsqueeze(0).expand(batch_size, -1, -1).reshape(-1, 4)[pos_mask, :] + pos_priors = ( + priors.unsqueeze(0) + .expand(batch_size, -1, -1) + .reshape(-1, 4)[pos_mask, :] + ) boxes_pred = decode(loc_p, pos_priors, cfg.use_yolo_regressors) boxes_targ = decode(loc_t, pos_priors, cfg.use_yolo_regressors) iou_targets = elemwise_box_iou(boxes_pred, boxes_targ) - obj_pos_loss = - iou_targets * F.logsigmoid(obj_data_pos) - (1 - iou_targets) * F.logsigmoid(-obj_data_pos) + obj_pos_loss = -iou_targets * F.logsigmoid(obj_data_pos) - ( + 1 - iou_targets + ) * F.logsigmoid(-obj_data_pos) obj_pos_loss = obj_pos_loss.sum() # All that was the objectiveness loss--now time for the class confidence loss - conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes - conf_t_pos = conf_t[pos_mask] - 1 # So subtract 1 here + conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes + conf_t_pos = conf_t[pos_mask] - 1 # So subtract 1 here - class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction='sum') + class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction="sum") return cfg.conf_alpha * (class_loss + obj_pos_loss + obj_neg_loss) - def direct_mask_loss(self, pos_idx, idx_t, loc_data, mask_data, priors, masks): """ Crops the gt masks using the predicted bboxes, scales them down, and outputs the BCE loss. """ loss_m = 0 @@ -437,40 +533,54 @@ def direct_mask_loss(self, pos_idx, idx_t, loc_data, mask_data, priors, masks): cur_pos_idx_squeezed = cur_pos_idx[:, 1] # Shape: [num_priors, 4], decoded predicted bboxes - pos_bboxes = decode(loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors) + pos_bboxes = decode( + loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors + ) pos_bboxes = pos_bboxes[cur_pos_idx].view(-1, 4).clamp(0, 1) pos_lookup = idx_t[idx, cur_pos_idx_squeezed] cur_masks = masks[idx] pos_masks = cur_masks[pos_lookup, :, :] - + # Convert bboxes to absolute coordinates num_pos, img_height, img_width = pos_masks.size() # Take care of all the bad behavior that can be caused by out of bounds coordinates - x1, x2 = sanitize_coordinates(pos_bboxes[:, 0], pos_bboxes[:, 2], img_width) - y1, y2 = sanitize_coordinates(pos_bboxes[:, 1], pos_bboxes[:, 3], img_height) + x1, x2 = sanitize_coordinates( + pos_bboxes[:, 0], pos_bboxes[:, 2], img_width + ) + y1, y2 = sanitize_coordinates( + pos_bboxes[:, 1], pos_bboxes[:, 3], img_height + ) # Crop each gt mask with the predicted bbox and rescale to the predicted mask size # Note that each bounding box crop is a different size so I don't think we can vectorize this scaled_masks = [] for jdx in range(num_pos): - tmp_mask = pos_masks[jdx, y1[jdx]:y2[jdx], x1[jdx]:x2[jdx]] + tmp_mask = pos_masks[jdx, y1[jdx] : y2[jdx], x1[jdx] : x2[jdx]] # Restore any dimensions we've left out because our bbox was 1px wide while tmp_mask.dim() < 2: tmp_mask = tmp_mask.unsqueeze(0) - new_mask = F.adaptive_avg_pool2d(tmp_mask.unsqueeze(0), cfg.mask_size) + new_mask = F.adaptive_avg_pool2d( + tmp_mask.unsqueeze(0), cfg.mask_size + ) scaled_masks.append(new_mask.view(1, -1)) - mask_t = torch.cat(scaled_masks, 0).gt(0.5).float() # Threshold downsampled mask - + mask_t = ( + torch.cat(scaled_masks, 0).gt(0.5).float() + ) # Threshold downsampled mask + pos_mask_data = mask_data[idx, cur_pos_idx_squeezed, :] - loss_m += F.binary_cross_entropy(torch.clamp(pos_mask_data, 0, 1), mask_t, reduction='sum') * cfg.mask_alpha + loss_m += ( + F.binary_cross_entropy( + torch.clamp(pos_mask_data, 0, 1), mask_t, reduction="sum" + ) + * cfg.mask_alpha + ) return loss_m - def coeff_diversity_loss(self, coeffs, instance_t): """ @@ -478,12 +588,15 @@ def coeff_diversity_loss(self, coeffs, instance_t): instance_t should be size [num_pos] and be values from 0 to num_instances-1 """ num_pos = coeffs.size(0) - instance_t = instance_t.view(-1) # juuuust to make sure + instance_t = instance_t.view(-1) # juuuust to make sure coeffs_norm = F.normalize(coeffs, dim=1) cos_sim = coeffs_norm @ coeffs_norm.t() - inst_eq = (instance_t[:, None].expand_as(cos_sim) == instance_t[None, :].expand_as(cos_sim)).float() + inst_eq = ( + instance_t[:, None].expand_as(cos_sim) + == instance_t[None, :].expand_as(cos_sim) + ).float() # Rescale to be between 0 and 1 cos_sim = (cos_sim + 1) / 2 @@ -495,19 +608,34 @@ def coeff_diversity_loss(self, coeffs, instance_t): # and all the losses will be divided by num_pos at the end, so just one extra time. return cfg.mask_proto_coeff_diversity_alpha * loss.sum() / num_pos - - def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels, interpolation_mode='bilinear'): + def lincomb_mask_loss( + self, + pos, + idx_t, + loc_data, + mask_data, + priors, + proto_data, + masks, + gt_box_t, + score_data, + inst_data, + labels, + interpolation_mode="bilinear", + ): mask_h = proto_data.size(1) mask_w = proto_data.size(2) - process_gt_bboxes = cfg.mask_proto_normalize_emulate_roi_pooling or cfg.mask_proto_crop + process_gt_bboxes = ( + cfg.mask_proto_normalize_emulate_roi_pooling or cfg.mask_proto_crop + ) if cfg.mask_proto_remove_empty_masks: # Make sure to store a copy of this because we edit it to get rid of all-zero masks pos = pos.clone() loss_m = 0 - loss_d = 0 # Coefficient diversity loss + loss_d = 0 # Coefficient diversity loss maskiou_t_list = [] maskiou_net_input_list = [] @@ -515,8 +643,12 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, for idx in range(mask_data.size(0)): with torch.no_grad(): - downsampled_masks = F.interpolate(masks[idx].unsqueeze(0), (mask_h, mask_w), - mode=interpolation_mode, align_corners=False).squeeze(0) + downsampled_masks = F.interpolate( + masks[idx].unsqueeze(0), + (mask_h, mask_w), + mode=interpolation_mode, + align_corners=False, + ).squeeze(0) downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous() if cfg.mask_proto_binarize_downsampled_gt: @@ -524,7 +656,7 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, if cfg.mask_proto_remove_empty_masks: # Get rid of gt masks that are so small they get downsampled away - very_small_masks = (downsampled_masks.sum(dim=(0,1)) <= 0.0001) + very_small_masks = downsampled_masks.sum(dim=(0, 1)) <= 0.0001 for i in range(very_small_masks.size(0)): if very_small_masks[i]: pos[idx, idx_t[idx] == i] = 0 @@ -536,19 +668,28 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, else: bin_gt = downsampled_masks - gt_foreground_norm = bin_gt / (torch.sum(bin_gt, dim=(0,1), keepdim=True) + 0.0001) - gt_background_norm = (1-bin_gt) / (torch.sum(1-bin_gt, dim=(0,1), keepdim=True) + 0.0001) + gt_foreground_norm = bin_gt / ( + torch.sum(bin_gt, dim=(0, 1), keepdim=True) + 0.0001 + ) + gt_background_norm = (1 - bin_gt) / ( + torch.sum(1 - bin_gt, dim=(0, 1), keepdim=True) + 0.0001 + ) - mask_reweighting = gt_foreground_norm * cfg.mask_proto_reweight_coeff + gt_background_norm - mask_reweighting *= mask_h * mask_w + mask_reweighting = ( + gt_foreground_norm * cfg.mask_proto_reweight_coeff + + gt_background_norm + ) + mask_reweighting *= mask_h * mask_w cur_pos = pos[idx] pos_idx_t = idx_t[idx, cur_pos] - + if process_gt_bboxes: # Note: this is in point-form if cfg.mask_proto_crop_with_pred_box: - pos_gt_box_t = decode(loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors)[cur_pos] + pos_gt_box_t = decode( + loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors + )[cur_pos] else: pos_gt_box_t = gt_box_t[idx, cur_pos] @@ -556,7 +697,7 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, continue proto_masks = proto_data[idx] - proto_coef = mask_data[idx, cur_pos, :] + proto_coef = mask_data[idx, cur_pos, :] if cfg.use_mask_scoring: mask_scores = score_data[idx, cur_pos, :] @@ -567,24 +708,24 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, div_coeffs = proto_coef loss_d += self.coeff_diversity_loss(div_coeffs, pos_idx_t) - + # If we have over the allowed number of masks, select a random sample old_num_pos = proto_coef.size(0) if old_num_pos > cfg.masks_to_train: perm = torch.randperm(proto_coef.size(0)) - select = perm[:cfg.masks_to_train] + select = perm[: cfg.masks_to_train] proto_coef = proto_coef[select, :] - pos_idx_t = pos_idx_t[select] - + pos_idx_t = pos_idx_t[select] + if process_gt_bboxes: pos_gt_box_t = pos_gt_box_t[select, :] if cfg.use_mask_scoring: mask_scores = mask_scores[select, :] num_pos = proto_coef.size(0) - mask_t = downsampled_masks[:, :, pos_idx_t] - label_t = labels[idx][pos_idx_t] + mask_t = downsampled_masks[:, :, pos_idx_t] + label_t = labels[idx][pos_idx_t] # Size: [mask_h, mask_w, num_pos] pred_masks = proto_masks @ proto_coef.t() @@ -592,33 +733,39 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, if cfg.mask_proto_double_loss: if cfg.mask_proto_mask_activation == activation_func.sigmoid: - pre_loss = F.binary_cross_entropy(torch.clamp(pred_masks, 0, 1), mask_t, reduction='sum') + pre_loss = F.binary_cross_entropy( + torch.clamp(pred_masks, 0, 1), mask_t, reduction="sum" + ) else: - pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction='sum') - + pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction="sum") + loss_m += cfg.mask_proto_double_loss_alpha * pre_loss if cfg.mask_proto_crop: pred_masks = crop(pred_masks, pos_gt_box_t) - + if cfg.mask_proto_mask_activation == activation_func.sigmoid: - pre_loss = F.binary_cross_entropy(torch.clamp(pred_masks, 0, 1), mask_t, reduction='none') + pre_loss = F.binary_cross_entropy( + torch.clamp(pred_masks, 0, 1), mask_t, reduction="none" + ) else: - pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction='none') + pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction="none") if cfg.mask_proto_normalize_mask_loss_by_sqrt_area: - gt_area = torch.sum(mask_t, dim=(0, 1), keepdim=True) + gt_area = torch.sum(mask_t, dim=(0, 1), keepdim=True) pre_loss = pre_loss / (torch.sqrt(gt_area) + 0.0001) - + if cfg.mask_proto_reweight_mask_loss: pre_loss = pre_loss * mask_reweighting[:, :, pos_idx_t] - + if cfg.mask_proto_normalize_emulate_roi_pooling: weight = mask_h * mask_w if cfg.mask_proto_crop else 1 pos_gt_csize = center_size(pos_gt_box_t) - gt_box_width = pos_gt_csize[:, 2] * mask_w + gt_box_width = pos_gt_csize[:, 2] * mask_w gt_box_height = pos_gt_csize[:, 3] * mask_h - pre_loss = pre_loss.sum(dim=(0, 1)) / gt_box_width / gt_box_height * weight + pre_loss = ( + pre_loss.sum(dim=(0, 1)) / gt_box_width / gt_box_height * weight + ) # If the number of masks were limited scale the loss accordingly if old_num_pos > num_pos: @@ -639,18 +786,20 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, mask_t = mask_t[:, :, select] label_t = label_t[select] - maskiou_net_input = pred_masks.permute(2, 0, 1).contiguous().unsqueeze(1) - pred_masks = pred_masks.gt(0.5).float() + maskiou_net_input = ( + pred_masks.permute(2, 0, 1).contiguous().unsqueeze(1) + ) + pred_masks = pred_masks.gt(0.5).float() maskiou_t = self._mask_iou(pred_masks, mask_t) - + maskiou_net_input_list.append(maskiou_net_input) maskiou_t_list.append(maskiou_t) label_t_list.append(label_t) - - losses = {'M': loss_m * cfg.mask_alpha / mask_h / mask_w} - + + losses = {"M": loss_m * cfg.mask_alpha / mask_h / mask_w} + if cfg.mask_proto_coeff_diversity_loss: - losses['D'] = loss_d + losses["D"] = loss_d if cfg.use_maskiou: # discard_mask_area discarded every mask in the batch, so nothing to do here @@ -664,7 +813,7 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, num_samples = maskiou_t.size(0) if cfg.maskious_to_train > 0 and num_samples > cfg.maskious_to_train: perm = torch.randperm(num_samples) - select = perm[:cfg.masks_to_train] + select = perm[: cfg.masks_to_train] maskiou_t = maskiou_t[select] label_t = label_t[select] maskiou_net_input = maskiou_net_input[select] @@ -674,7 +823,7 @@ def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, return losses def _mask_iou(self, mask1, mask2): - intersection = torch.sum(mask1*mask2, dim=(0, 1)) + intersection = torch.sum(mask1 * mask2, dim=(0, 1)) area1 = torch.sum(mask1, dim=(0, 1)) area2 = torch.sum(mask2, dim=(0, 1)) union = (area1 + area2) - intersection @@ -689,6 +838,6 @@ def mask_iou_loss(self, net, maskiou_targets): label_t = label_t[:, None] maskiou_p = torch.gather(maskiou_p, dim=1, index=label_t).view(-1) - loss_i = F.smooth_l1_loss(maskiou_p, maskiou_t, reduction='sum') - + loss_i = F.smooth_l1_loss(maskiou_p, maskiou_t, reduction="sum") + return loss_i * cfg.maskiou_alpha diff --git a/layers/output_utils.py b/layers/output_utils.py index 27efac935..15445a130 100644 --- a/layers/output_utils.py +++ b/layers/output_utils.py @@ -12,8 +12,17 @@ from utils import timer from .box_utils import crop, sanitize_coordinates -def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', - visualize_lincomb=False, crop_masks=True, score_threshold=0): + +def postprocess( + det_output, + w, + h, + batch_idx=0, + interpolation_mode="bilinear", + visualize_lincomb=False, + crop_masks=True, + score_threshold=0, +): """ Postprocesses the output of Yolact on testing mode into a format that makes sense, accounting for all the possible configuration settings. @@ -31,38 +40,38 @@ def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', - boxes [num_det, 4]: The bounding box for each detection in absolute point form. - masks [num_det, h, w]: Full image masks for each detection. """ - + dets = det_output[batch_idx] - net = dets['net'] - dets = dets['detection'] + net = dets["net"] + dets = dets["detection"] if dets is None: - return [torch.Tensor()] * 4 # Warning, this is 4 copies of the same thing + return [torch.Tensor()] * 4 # Warning, this is 4 copies of the same thing if score_threshold > 0: - keep = dets['score'] > score_threshold + keep = dets["score"] > score_threshold for k in dets: - if k != 'proto': + if k != "proto": dets[k] = dets[k][keep] - - if dets['score'].size(0) == 0: + + if dets["score"].size(0) == 0: return [torch.Tensor()] * 4 - + # Actually extract everything from dets now - classes = dets['class'] - boxes = dets['box'] - scores = dets['score'] - masks = dets['mask'] + classes = dets["class"] + boxes = dets["box"] + scores = dets["score"] + masks = dets["mask"] if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch: # At this points masks is only the coefficients - proto_data = dets['proto'] - + proto_data = dets["proto"] + # Test flag, do not upvote if cfg.mask_proto_debug: - np.save('scripts/proto.npy', proto_data.cpu().numpy()) - + np.save("scripts/proto.npy", proto_data.cpu().numpy()) + if visualize_lincomb: display_lincomb(proto_data, masks) @@ -77,10 +86,12 @@ def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', masks = masks.permute(2, 0, 1).contiguous() if cfg.use_maskiou: - with timer.env('maskiou_net'): + with timer.env("maskiou_net"): with torch.no_grad(): maskiou_p = net.maskiou_net(masks.unsqueeze(1)) - maskiou_p = torch.gather(maskiou_p, dim=1, index=classes.unsqueeze(1)).squeeze(1) + maskiou_p = torch.gather( + maskiou_p, dim=1, index=classes.unsqueeze(1) + ).squeeze(1) if cfg.rescore_mask: if cfg.rescore_bbox: scores = scores * maskiou_p @@ -88,14 +99,19 @@ def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', scores = [scores, scores * maskiou_p] # Scale masks up to the full image - masks = F.interpolate(masks.unsqueeze(0), (h, w), mode=interpolation_mode, align_corners=False).squeeze(0) + masks = F.interpolate( + masks.unsqueeze(0), (h, w), mode=interpolation_mode, align_corners=False + ).squeeze(0) # Binarize the masks masks.gt_(0.5) - - boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, cast=False) - boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, cast=False) + boxes[:, 0], boxes[:, 2] = sanitize_coordinates( + boxes[:, 0], boxes[:, 2], w, cast=False + ) + boxes[:, 1], boxes[:, 3] = sanitize_coordinates( + boxes[:, 1], boxes[:, 3], h, cast=False + ) boxes = boxes.long() if cfg.mask_type == mask_type.direct and cfg.eval_mask_branch: @@ -111,37 +127,36 @@ def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', # Just in case if mask_w * mask_h <= 0 or mask_w < 0: continue - + mask = masks[jdx, :].view(1, 1, cfg.mask_size, cfg.mask_size) - mask = F.interpolate(mask, (mask_h, mask_w), mode=interpolation_mode, align_corners=False) + mask = F.interpolate( + mask, (mask_h, mask_w), mode=interpolation_mode, align_corners=False + ) mask = mask.gt(0.5).float() full_masks[jdx, y1:y2, x1:x2] = mask - + masks = full_masks return classes, scores, boxes, masks - - - def undo_image_transformation(img, w, h): """ Takes a transformed image tensor and returns a numpy ndarray that is untransformed. Arguments w and h are the original height and width of the image. """ img_numpy = img.permute(1, 2, 0).cpu().numpy() - img_numpy = img_numpy[:, :, (2, 1, 0)] # To BRG + img_numpy = img_numpy[:, :, (2, 1, 0)] # To BRG if cfg.backbone.transform.normalize: img_numpy = (img_numpy * np.array(STD) + np.array(MEANS)) / 255.0 elif cfg.backbone.transform.subtract_means: img_numpy = (img_numpy / 255.0 + np.array(MEANS) / 255.0).astype(np.float32) - - img_numpy = img_numpy[:, :, (2, 1, 0)] # To RGB + + img_numpy = img_numpy[:, :, (2, 1, 0)] # To RGB img_numpy = np.clip(img_numpy, 0, 1) - return cv2.resize(img_numpy, (w,h)) + return cv2.resize(img_numpy, (w, h)) def display_lincomb(proto_data, masks): @@ -151,16 +166,17 @@ def display_lincomb(proto_data, masks): for kdx in range(1): jdx = kdx + 0 import matplotlib.pyplot as plt + coeffs = masks[jdx, :].cpu().numpy() idx = np.argsort(-np.abs(coeffs)) # plt.bar(list(range(idx.shape[0])), coeffs[idx]) # plt.show() - + coeffs_sort = coeffs[idx] - arr_h, arr_w = (4,8) + arr_h, arr_w = (4, 8) proto_h, proto_w, _ = proto_data.size() - arr_img = np.zeros([proto_h*arr_h, proto_w*arr_w]) - arr_run = np.zeros([proto_h*arr_h, proto_w*arr_w]) + arr_img = np.zeros([proto_h * arr_h, proto_w * arr_w]) + arr_run = np.zeros([proto_h * arr_h, proto_w * arr_w]) test = torch.sum(proto_data, -1).cpu().numpy() for y in range(arr_h): @@ -168,16 +184,29 @@ def display_lincomb(proto_data, masks): i = arr_w * y + x if i == 0: - running_total = proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i] + running_total = ( + proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i] + ) else: - running_total += proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i] + running_total += ( + proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i] + ) running_total_nonlin = running_total if cfg.mask_proto_mask_activation == activation_func.sigmoid: - running_total_nonlin = (1/(1+np.exp(-running_total_nonlin))) - - arr_img[y*proto_h:(y+1)*proto_h, x*proto_w:(x+1)*proto_w] = (proto_data[:, :, idx[i]] / torch.max(proto_data[:, :, idx[i]])).cpu().numpy() * coeffs_sort[i] - arr_run[y*proto_h:(y+1)*proto_h, x*proto_w:(x+1)*proto_w] = (running_total_nonlin > 0.5).astype(np.float) + running_total_nonlin = 1 / (1 + np.exp(-running_total_nonlin)) + + arr_img[ + y * proto_h : (y + 1) * proto_h, x * proto_w : (x + 1) * proto_w + ] = ( + (proto_data[:, :, idx[i]] / torch.max(proto_data[:, :, idx[i]])) + .cpu() + .numpy() + * coeffs_sort[i] + ) + arr_run[ + y * proto_h : (y + 1) * proto_h, x * proto_w : (x + 1) * proto_w + ] = (running_total_nonlin > 0.5).astype(np.float) plt.imshow(arr_img) plt.show() # plt.imshow(arr_run) diff --git a/run_coco_eval.py b/run_coco_eval.py index 27cb4d603..ea803bd53 100644 --- a/run_coco_eval.py +++ b/run_coco_eval.py @@ -10,40 +10,40 @@ from pycocotools.cocoeval import COCOeval -parser = argparse.ArgumentParser(description='COCO Detections Evaluator') -parser.add_argument('--bbox_det_file', default='results/bbox_detections.json', type=str) -parser.add_argument('--mask_det_file', default='results/mask_detections.json', type=str) -parser.add_argument('--gt_ann_file', default='data/coco/annotations/instances_val2017.json', type=str) -parser.add_argument('--eval_type', default='both', choices=['bbox', 'mask', 'both'], type=str) +parser = argparse.ArgumentParser(description="COCO Detections Evaluator") +parser.add_argument("--bbox_det_file", default="results/bbox_detections.json", type=str) +parser.add_argument("--mask_det_file", default="results/mask_detections.json", type=str) +parser.add_argument( + "--gt_ann_file", default="data/coco/annotations/instances_val2017.json", type=str +) +parser.add_argument( + "--eval_type", default="both", choices=["bbox", "mask", "both"], type=str +) args = parser.parse_args() +if __name__ == "__main__": -if __name__ == '__main__': - - eval_bbox = (args.eval_type in ('bbox', 'both')) - eval_mask = (args.eval_type in ('mask', 'both')) - - print('Loading annotations...') - gt_annotations = COCO(args.gt_ann_file) - if eval_bbox: - bbox_dets = gt_annotations.loadRes(args.bbox_det_file) - if eval_mask: - mask_dets = gt_annotations.loadRes(args.mask_det_file) - - if eval_bbox: - print('\nEvaluating BBoxes:') - bbox_eval = COCOeval(gt_annotations, bbox_dets, 'bbox') - bbox_eval.evaluate() - bbox_eval.accumulate() - bbox_eval.summarize() - - if eval_mask: - print('\nEvaluating Masks:') - bbox_eval = COCOeval(gt_annotations, mask_dets, 'segm') - bbox_eval.evaluate() - bbox_eval.accumulate() - bbox_eval.summarize() + eval_bbox = args.eval_type in ("bbox", "both") + eval_mask = args.eval_type in ("mask", "both") + print("Loading annotations...") + gt_annotations = COCO(args.gt_ann_file) + if eval_bbox: + bbox_dets = gt_annotations.loadRes(args.bbox_det_file) + if eval_mask: + mask_dets = gt_annotations.loadRes(args.mask_det_file) + if eval_bbox: + print("\nEvaluating BBoxes:") + bbox_eval = COCOeval(gt_annotations, bbox_dets, "bbox") + bbox_eval.evaluate() + bbox_eval.accumulate() + bbox_eval.summarize() + if eval_mask: + print("\nEvaluating Masks:") + bbox_eval = COCOeval(gt_annotations, mask_dets, "segm") + bbox_eval.evaluate() + bbox_eval.accumulate() + bbox_eval.summarize() diff --git a/scripts/augment_bbox.py b/scripts/augment_bbox.py index 40c7b8b32..cf0b8be3c 100644 --- a/scripts/augment_bbox.py +++ b/scripts/augment_bbox.py @@ -1,4 +1,3 @@ - import os.path as osp import json, pickle import sys @@ -12,82 +11,78 @@ max_image_size = 550 augment_idx = 0 -dump_file = 'weights/bboxes_aug.pkl' -box_file = 'weights/bboxes.pkl' - -def augment_boxes(bboxes): - bboxes_rel = [] - for box in bboxes: - bboxes_rel.append(prep_box(box)) - bboxes_rel = np.concatenate(bboxes_rel, axis=0) - - with open(dump_file, 'wb') as f: - pickle.dump(bboxes_rel, f) - -def prep_box(box_list): - global augment_idx - boxes = np.array([box_list[2:]], dtype=np.float32) +dump_file = "weights/bboxes_aug.pkl" +box_file = "weights/bboxes.pkl" - # Image width and height - width, height = box_list[:2] - - # To point form - boxes[:, 2:] += boxes[:, :2] +def augment_boxes(bboxes): + bboxes_rel = [] + for box in bboxes: + bboxes_rel.append(prep_box(box)) + bboxes_rel = np.concatenate(bboxes_rel, axis=0) - # Expand - ratio = random.uniform(1, 4) - left = random.uniform(0, width*ratio - width) - top = random.uniform(0, height*ratio - height) + with open(dump_file, "wb") as f: + pickle.dump(bboxes_rel, f) - height *= ratio - width *= ratio - boxes[:, :2] += (int(left), int(top)) - boxes[:, 2:] += (int(left), int(top)) +def prep_box(box_list): + global augment_idx + boxes = np.array([box_list[2:]], dtype=np.float32) + # Image width and height + width, height = box_list[:2] - # RandomSampleCrop - height, width, boxes = random_sample_crop(height, width, boxes) + # To point form + boxes[:, 2:] += boxes[:, :2] + # Expand + ratio = random.uniform(1, 4) + left = random.uniform(0, width * ratio - width) + top = random.uniform(0, height * ratio - height) - # RandomMirror - if random.randint(0, 2): - boxes[:, 0::2] = width - boxes[:, 2::-2] + height *= ratio + width *= ratio - - # Resize - boxes[:, [0, 2]] *= (max_image_size / width) - boxes[:, [1, 3]] *= (max_image_size / height) - width = height = max_image_size + boxes[:, :2] += (int(left), int(top)) + boxes[:, 2:] += (int(left), int(top)) + # RandomSampleCrop + height, width, boxes = random_sample_crop(height, width, boxes) - # ToPercentCoords - boxes[:, [0, 2]] /= width - boxes[:, [1, 3]] /= height + # RandomMirror + if random.randint(0, 2): + boxes[:, 0::2] = width - boxes[:, 2::-2] - if augment_idx % 50000 == 0: - print('Current idx: %d' % augment_idx) + # Resize + boxes[:, [0, 2]] *= max_image_size / width + boxes[:, [1, 3]] *= max_image_size / height + width = height = max_image_size - augment_idx += 1 + # ToPercentCoords + boxes[:, [0, 2]] /= width + boxes[:, [1, 3]] /= height - return boxes + if augment_idx % 50000 == 0: + print("Current idx: %d" % augment_idx) + augment_idx += 1 + return boxes sample_options = ( - # using entire original input image - None, - # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 - (0.1, None), - (0.3, None), - (0.7, None), - (0.9, None), - # randomly sample a patch - (None, None), + # using entire original input image + None, + # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 + (0.1, None), + (0.3, None), + (0.7, None), + (0.9, None), + # randomly sample a patch + (None, None), ) + def intersect(box_a, box_b): max_xy = np.minimum(box_a[:, 2:], box_b[2:]) min_xy = np.maximum(box_a[:, :2], box_b[:2]) @@ -107,65 +102,63 @@ def jaccard_numpy(box_a, box_b): jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] """ inter = intersect(box_a, box_b) - area_a = ((box_a[:, 2]-box_a[:, 0]) * - (box_a[:, 3]-box_a[:, 1])) # [A,B] - area_b = ((box_b[2]-box_b[0]) * - (box_b[3]-box_b[1])) # [A,B] + area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]) # [A,B] + area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) # [A,B] union = area_a + area_b - inter return inter / union # [A,B] def random_sample_crop(height, width, boxes=None): - global sample_options - - while True: - # randomly choose a mode - mode = random.choice(sample_options) - if mode is None: - return height, width, boxes + global sample_options + + while True: + # randomly choose a mode + mode = random.choice(sample_options) + if mode is None: + return height, width, boxes + + min_iou, max_iou = mode + if min_iou is None: + min_iou = float("-inf") + if max_iou is None: + max_iou = float("inf") - min_iou, max_iou = mode - if min_iou is None: - min_iou = float('-inf') - if max_iou is None: - max_iou = float('inf') + for _ in range(50): + w = random.uniform(0.3 * width, width) + h = random.uniform(0.3 * height, height) - for _ in range(50): - w = random.uniform(0.3 * width, width) - h = random.uniform(0.3 * height, height) + if h / w < 0.5 or h / w > 2: + continue - if h / w < 0.5 or h / w > 2: - continue + left = random.uniform(0, width - w) + top = random.uniform(0, height - h) - left = random.uniform(0, width - w) - top = random.uniform(0, height - h) + rect = np.array([int(left), int(top), int(left + w), int(top + h)]) + overlap = jaccard_numpy(boxes, rect) + if overlap.min() < min_iou and max_iou < overlap.max(): + continue - rect = np.array([int(left), int(top), int(left+w), int(top+h)]) - overlap = jaccard_numpy(boxes, rect) - if overlap.min() < min_iou and max_iou < overlap.max(): - continue + centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 - centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 + m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) + m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) + mask = m1 * m2 - m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) - m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) - mask = m1 * m2 + if not mask.any(): + continue - if not mask.any(): - continue + current_boxes = boxes[mask, :].copy() + current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2]) + current_boxes[:, :2] -= rect[:2] + current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:]) + current_boxes[:, 2:] -= rect[:2] - current_boxes = boxes[mask, :].copy() - current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2]) - current_boxes[:, :2] -= rect[:2] - current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:]) - current_boxes[:, 2:] -= rect[:2] + return h, w, current_boxes - return h, w, current_boxes +if __name__ == "__main__": -if __name__ == '__main__': - - with open(box_file, 'rb') as f: - bboxes = pickle.load(f) + with open(box_file, "rb") as f: + bboxes = pickle.load(f) - augment_boxes(bboxes) + augment_boxes(bboxes) diff --git a/scripts/bbox_recall.py b/scripts/bbox_recall.py index 3fe26259e..94155ab61 100644 --- a/scripts/bbox_recall.py +++ b/scripts/bbox_recall.py @@ -15,8 +15,8 @@ import numpy as np -dump_file = 'weights/bboxes.pkl' -aug_file = 'weights/bboxes_aug.pkl' +dump_file = "weights/bboxes.pkl" +aug_file = "weights/bboxes_aug.pkl" use_augmented_boxes = True @@ -34,10 +34,14 @@ def intersect(box_a, box_b): """ A = box_a.size(0) B = box_b.size(0) - max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), - box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) - min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), - box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + max_xy = torch.min( + box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2), + ) + min_xy = torch.max( + box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2), + ) inter = torch.clamp((max_xy - min_xy), min=0) return inter[:, :, 0] * inter[:, :, 1] @@ -55,10 +59,16 @@ def jaccard(box_a, box_b, iscrowd=False): jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] """ inter = intersect(box_a, box_b) - area_a = ((box_a[:, 2]-box_a[:, 0]) * - (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] - area_b = ((box_b[:, 2]-box_b[:, 0]) * - (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + area_a = ( + ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])) + .unsqueeze(1) + .expand_as(inter) + ) # [A,B] + area_b = ( + ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])) + .unsqueeze(0) + .expand_as(inter) + ) # [A,B] union = area_a + area_b - inter if iscrowd: @@ -66,9 +76,16 @@ def jaccard(box_a, box_b, iscrowd=False): else: return inter / union # [A,B] + # Also convert to point form def to_relative(bboxes): - return np.concatenate((bboxes[:, 2:4] / bboxes[:, :2], (bboxes[:, 2:4] + bboxes[:, 4:]) / bboxes[:, :2]), axis=1) + return np.concatenate( + ( + bboxes[:, 2:4] / bboxes[:, :2], + (bboxes[:, 2:4] + bboxes[:, 4:]) / bboxes[:, :2], + ), + axis=1, + ) def make_priors(conv_size, scales, aspect_ratios): @@ -80,34 +97,39 @@ def make_priors(conv_size, scales, aspect_ratios): for j, i in product(range(conv_h), range(conv_w)): x = (i + 0.5) / conv_w y = (j + 0.5) / conv_h - + for scale, ars in zip(scales, aspect_ratios): for ar in ars: w = scale * ar / conv_w h = scale / ar / conv_h # Point form - prior_data += [x - w/2, y - h/2, x + w/2, y + h/2] - + prior_data += [x - w / 2, y - h / 2, x + w / 2, y + h / 2] + return np.array(prior_data).reshape(-1, 4) + # fixed_ssd_config # scales = [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [2.1, 2.37], [1.8, 1.92]] # aspect_ratios = [ [[1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3, 3] ] # conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)] -scales = [[1.68, 2.91], - [2.95, 2.22, 0.84], - [2.23, 2.17, 3.12], - [0.76, 1.94, 2.72], - [2.10, 2.65], - [1.80, 1.92]] -aspect_ratios = [[[0.72, 0.96], [0.68, 1.17]], - [[1.28, 0.66], [0.63, 1.23], [0.89, 1.40]], - [[2.05, 1.24], [0.57, 0.83], [0.61, 1.15]], - [[1.00, 2.21], [0.47, 1.60], [1.44, 0.79]], - [[1.00, 1.41, 0.71, 1.73, 0.58], [1.08]], - [[1.00, 1.41, 0.71, 1.73, 0.58], [1.00]]] +scales = [ + [1.68, 2.91], + [2.95, 2.22, 0.84], + [2.23, 2.17, 3.12], + [0.76, 1.94, 2.72], + [2.10, 2.65], + [1.80, 1.92], +] +aspect_ratios = [ + [[0.72, 0.96], [0.68, 1.17]], + [[1.28, 0.66], [0.63, 1.23], [0.89, 1.40]], + [[2.05, 1.24], [0.57, 0.83], [0.61, 1.15]], + [[1.00, 2.21], [0.47, 1.60], [1.44, 0.79]], + [[1.00, 1.41, 0.71, 1.73, 0.58], [1.08]], + [[1.00, 1.41, 0.71, 1.73, 0.58], [1.00]], +] conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)] # yrm33_config @@ -120,9 +142,9 @@ def make_priors(conv_size, scales, aspect_ratios): MEDIUM = 1 LARGE = 2 -if __name__ == '__main__': - - with open(dump_file, 'rb') as f: +if __name__ == "__main__": + + with open(dump_file, "rb") as f: bboxes = pickle.load(f) sizes = [] @@ -138,18 +160,20 @@ def make_priors(conv_size, scales, aspect_ratios): sizes.append(LARGE) # Each box is in the form [im_w, im_h, pos_x, pos_y, size_x, size_y] - + if use_augmented_boxes: - with open(aug_file, 'rb') as f: + with open(aug_file, "rb") as f: bboxes_rel = pickle.load(f) else: bboxes_rel = to_relative(np.array(bboxes)) - with torch.no_grad(): sizes = torch.Tensor(sizes) - anchors = [make_priors(cs, s, ar) for cs, s, ar in zip(conv_sizes, scales, aspect_ratios)] + anchors = [ + make_priors(cs, s, ar) + for cs, s, ar in zip(conv_sizes, scales, aspect_ratios) + ] anchors = np.concatenate(anchors, axis=0) anchors = torch.Tensor(anchors).cuda() @@ -159,23 +183,19 @@ def make_priors(conv_size, scales, aspect_ratios): chunk_size = 1000 for i in range((bboxes_rel.size(0) // chunk_size) + 1): start = i * chunk_size - end = min((i + 1) * chunk_size, bboxes_rel.size(0)) - + end = min((i + 1) * chunk_size, bboxes_rel.size(0)) + ious = jaccard(bboxes_rel[start:end, :], anchors) maxes, maxidx = torch.max(ious, dim=1) perGTAnchorMax[start:end] = maxes - hits = (perGTAnchorMax > 0.5).float() - print('Total recall: %.2f' % (torch.sum(hits) / hits.size(0) * 100)) + print("Total recall: %.2f" % (torch.sum(hits) / hits.size(0) * 100)) print() - for i, metric in zip(range(3), ('small', 'medium', 'large')): + for i, metric in zip(range(3), ("small", "medium", "large")): _hits = hits[sizes == i] - _size = (1 if _hits.size(0) == 0 else _hits.size(0)) - print(metric + ' recall: %.2f' % ((torch.sum(_hits) / _size) * 100)) - - - + _size = 1 if _hits.size(0) == 0 else _hits.size(0) + print(metric + " recall: %.2f" % ((torch.sum(_hits) / _size) * 100)) diff --git a/scripts/cluster_bbox_sizes.py b/scripts/cluster_bbox_sizes.py index 91285ce9d..cbf5afc44 100644 --- a/scripts/cluster_bbox_sizes.py +++ b/scripts/cluster_bbox_sizes.py @@ -12,58 +12,59 @@ import numpy as np import sklearn.cluster as cluster -dump_file = 'weights/bboxes.pkl' +dump_file = "weights/bboxes.pkl" max_size = 550 num_scale_clusters = 5 num_aspect_ratio_clusters = 3 + def to_relative(bboxes): - return bboxes[:, 2:4] / bboxes[:, :2] + return bboxes[:, 2:4] / bboxes[:, :2] + def process(bboxes): - return to_relative(bboxes) * max_size + return to_relative(bboxes) * max_size -if __name__ == '__main__': - - with open(dump_file, 'rb') as f: - bboxes = pickle.load(f) - bboxes = np.array(bboxes) - bboxes = process(bboxes) - bboxes = bboxes[(bboxes[:, 0] > 1) * (bboxes[:, 1] > 1)] +if __name__ == "__main__": - scale = np.sqrt(bboxes[:, 0] * bboxes[:, 1]).reshape(-1, 1) + with open(dump_file, "rb") as f: + bboxes = pickle.load(f) - clusterer = cluster.KMeans(num_scale_clusters, random_state=99, n_jobs=4) - assignments = clusterer.fit_predict(scale) - counts = np.bincount(assignments) + bboxes = np.array(bboxes) + bboxes = process(bboxes) + bboxes = bboxes[(bboxes[:, 0] > 1) * (bboxes[:, 1] > 1)] - cluster_centers = clusterer.cluster_centers_ + scale = np.sqrt(bboxes[:, 0] * bboxes[:, 1]).reshape(-1, 1) - center_indices = list(range(num_scale_clusters)) - center_indices.sort(key=lambda x: cluster_centers[x, 0]) + clusterer = cluster.KMeans(num_scale_clusters, random_state=99, n_jobs=4) + assignments = clusterer.fit_predict(scale) + counts = np.bincount(assignments) - for idx in center_indices: - center = cluster_centers[idx, 0] - boxes_for_center = bboxes[assignments == idx] - aspect_ratios = (boxes_for_center[:,0] / boxes_for_center[:,1]).reshape(-1, 1) + cluster_centers = clusterer.cluster_centers_ - c = cluster.KMeans(num_aspect_ratio_clusters, random_state=idx, n_jobs=4) - ca = c.fit_predict(aspect_ratios) - cc = np.bincount(ca) + center_indices = list(range(num_scale_clusters)) + center_indices.sort(key=lambda x: cluster_centers[x, 0]) - c = list(c.cluster_centers_.reshape(-1)) - cidx = list(range(num_aspect_ratio_clusters)) - cidx.sort(key=lambda x: -cc[x]) + for idx in center_indices: + center = cluster_centers[idx, 0] + boxes_for_center = bboxes[assignments == idx] + aspect_ratios = (boxes_for_center[:, 0] / boxes_for_center[:, 1]).reshape(-1, 1) - # import code - # code.interact(local=locals()) + c = cluster.KMeans(num_aspect_ratio_clusters, random_state=idx, n_jobs=4) + ca = c.fit_predict(aspect_ratios) + cc = np.bincount(ca) - print('%.3f (%d) aspect ratios:' % (center, counts[idx])) - for idx in cidx: - print('\t%.2f (%d)' % (c[idx], cc[idx])) - print() - # exit() + c = list(c.cluster_centers_.reshape(-1)) + cidx = list(range(num_aspect_ratio_clusters)) + cidx.sort(key=lambda x: -cc[x]) + # import code + # code.interact(local=locals()) + print("%.3f (%d) aspect ratios:" % (center, counts[idx])) + for idx in cidx: + print("\t%.2f (%d)" % (c[idx], cc[idx])) + print() + # exit() diff --git a/scripts/compute_masks.py b/scripts/compute_masks.py index 866741800..a2be83bb1 100644 --- a/scripts/compute_masks.py +++ b/scripts/compute_masks.py @@ -4,8 +4,15 @@ import torch import torch.nn.functional as F -COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), - (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) +COLORS = ( + (255, 0, 0, 128), + (0, 255, 0, 128), + (0, 0, 255, 128), + (0, 255, 255, 128), + (255, 0, 255, 128), + (255, 255, 0, 128), +) + def mask_iou(mask1, mask2): """ @@ -19,43 +26,49 @@ def mask_iou(mask1, mask2): return intersection / union + def paint_mask(img_numpy, mask, color): - h, w, _ = img_numpy.shape - img_numpy = img_numpy.copy() + h, w, _ = img_numpy.shape + img_numpy = img_numpy.copy() + + mask = np.tile(mask.reshape(h, w, 1), (1, 1, 3)) + color_np = np.array(color[:3]).reshape(1, 1, 3) + color_np = np.tile(color_np, (h, w, 1)) + mask_color = mask * color_np - mask = np.tile(mask.reshape(h, w, 1), (1, 1, 3)) - color_np = np.array(color[:3]).reshape(1, 1, 3) - color_np = np.tile(color_np, (h, w, 1)) - mask_color = mask * color_np + mask_alpha = 0.3 - mask_alpha = 0.3 + # Blend image and mask + image_crop = img_numpy * mask + img_numpy *= 1 - mask + img_numpy += image_crop * (1 - mask_alpha) + mask_color * mask_alpha - # Blend image and mask - image_crop = img_numpy * mask - img_numpy *= (1-mask) - img_numpy += image_crop * (1-mask_alpha) + mask_color * mask_alpha + return img_numpy - return img_numpy # Inverse sigmoid def logit(x): - return np.log(x / (1-x + 0.0001) + 0.0001) + return np.log(x / (1 - x + 0.0001) + 0.0001) + def sigmoid(x): - return 1 / (1 + np.exp(-x)) + return 1 / (1 + np.exp(-x)) + -img_fmt = '../data/coco/images/%012d.jpg' -with open('info.txt', 'r') as f: - img_id = int(f.read()) +img_fmt = "../data/coco/images/%012d.jpg" +with open("info.txt", "r") as f: + img_id = int(f.read()) img = plt.imread(img_fmt % img_id).astype(np.float32) h, w, _ = img.shape -gt_masks = np.load('gt.npy').astype(np.float32).transpose(1, 2, 0) -proto_masks = np.load('proto.npy').astype(np.float32) +gt_masks = np.load("gt.npy").astype(np.float32).transpose(1, 2, 0) +proto_masks = np.load("proto.npy").astype(np.float32) proto_masks = torch.Tensor(proto_masks).permute(2, 0, 1).contiguous().unsqueeze(0) -proto_masks = F.interpolate(proto_masks, (h, w), mode='bilinear', align_corners=False).squeeze(0) +proto_masks = F.interpolate( + proto_masks, (h, w), mode="bilinear", align_corners=False +).squeeze(0) proto_masks = proto_masks.permute(1, 2, 0).numpy() # # A x = b @@ -68,8 +81,10 @@ def sigmoid(x): approximated_masks = (np.matmul(proto_masks, x) > 0.5).astype(np.float32) num_gt = approximated_masks.shape[2] -ious = mask_iou(torch.Tensor(approximated_masks.reshape(-1, num_gt).T), - torch.Tensor(gt_masks.reshape(-1, num_gt).T)) +ious = mask_iou( + torch.Tensor(approximated_masks.reshape(-1, num_gt).T), + torch.Tensor(gt_masks.reshape(-1, num_gt).T), +) ious = [int(ious[i, i].item() * 100) for i in range(num_gt)] ious.sort(key=lambda x: -x) @@ -79,15 +94,15 @@ def sigmoid(x): gt_img = img.copy() for i in range(num_gt): - gt_img = paint_mask(gt_img, gt_masks[:, :, i], COLORS[i % len(COLORS)]) - + gt_img = paint_mask(gt_img, gt_masks[:, :, i], COLORS[i % len(COLORS)]) + plt.imshow(gt_img / 255) -plt.title('GT') +plt.title("GT") plt.show() for i in range(num_gt): - img = paint_mask(img, approximated_masks[:, :, i], COLORS[i % len(COLORS)]) + img = paint_mask(img, approximated_masks[:, :, i], COLORS[i % len(COLORS)]) plt.imshow(img / 255) -plt.title('Approximated') +plt.title("Approximated") plt.show() diff --git a/scripts/convert_darknet.py b/scripts/convert_darknet.py index b6b5cf4c2..eaf2c69b0 100644 --- a/scripts/convert_darknet.py +++ b/scripts/convert_darknet.py @@ -2,8 +2,8 @@ import h5py import torch -f = h5py.File('darknet53.h5', 'r') -m = f['model_weights'] +f = h5py.File("darknet53.h5", "r") +m = f["model_weights"] yolo_keys = list(m.keys()) yolo_keys = [x for x in yolo_keys if len(m[x].keys()) > 0] @@ -15,34 +15,33 @@ sd_keys.sort() # Note this won't work if there are 10 elements in some list but whatever that doesn't happen -layer_keys = list(set(['.'.join(x.split('.')[:-2]) for x in sd_keys])) +layer_keys = list(set([".".join(x.split(".")[:-2]) for x in sd_keys])) layer_keys.sort() # print([x for x in sd_keys if x.startswith(layer_keys[0])]) mapping = { - '.0.weight' : ('conv2d_%d', 'kernel:0'), - '.1.bias' : ('batch_normalization_%d', 'beta:0'), - '.1.weight' : ('batch_normalization_%d', 'gamma:0'), - '.1.running_var' : ('batch_normalization_%d', 'moving_variance:0'), - '.1.running_mean': ('batch_normalization_%d', 'moving_mean:0'), - '.1.num_batches_tracked': None, + ".0.weight": ("conv2d_%d", "kernel:0"), + ".1.bias": ("batch_normalization_%d", "beta:0"), + ".1.weight": ("batch_normalization_%d", "gamma:0"), + ".1.running_var": ("batch_normalization_%d", "moving_variance:0"), + ".1.running_mean": ("batch_normalization_%d", "moving_mean:0"), + ".1.num_batches_tracked": None, } for i, layer_key in zip(range(1, len(layer_keys) + 1), layer_keys): - # This is pretty inefficient but I don't care - for weight_key in [x for x in sd_keys if x.startswith(layer_key)]: - diff = weight_key[len(layer_key):] - - if mapping[diff] is not None: - yolo_key = mapping[diff][0] % i - sub_key = mapping[diff][1] - - yolo_weight = torch.Tensor(m[yolo_key][yolo_key][sub_key].value) - if (len(yolo_weight.size()) == 4): - yolo_weight = yolo_weight.permute(3, 2, 0, 1).contiguous() - - sd[weight_key] = yolo_weight - -torch.save(sd, 'weights/darknet53.pth') + # This is pretty inefficient but I don't care + for weight_key in [x for x in sd_keys if x.startswith(layer_key)]: + diff = weight_key[len(layer_key) :] + if mapping[diff] is not None: + yolo_key = mapping[diff][0] % i + sub_key = mapping[diff][1] + + yolo_weight = torch.Tensor(m[yolo_key][yolo_key][sub_key].value) + if len(yolo_weight.size()) == 4: + yolo_weight = yolo_weight.permute(3, 2, 0, 1).contiguous() + + sd[weight_key] = yolo_weight + +torch.save(sd, "weights/darknet53.pth") diff --git a/scripts/convert_sbd.py b/scripts/convert_sbd.py index 61f049a36..4b0486e72 100644 --- a/scripts/convert_sbd.py +++ b/scripts/convert_sbd.py @@ -3,6 +3,7 @@ import pycocotools.mask import numpy as np + def mask2bbox(mask): rows = np.any(mask, axis=1) cols = np.any(mask, axis=0) @@ -12,20 +13,19 @@ def mask2bbox(mask): return cmin, rmin, cmax - cmin, rmax - rmin - -inst_path = './inst/' -img_path = './img/' -img_name_fmt = '%s.jpg' -ann_name_fmt = '%s.mat' +inst_path = "./inst/" +img_path = "./img/" +img_name_fmt = "%s.jpg" +ann_name_fmt = "%s.mat" image_id = 1 -ann_id = 1 +ann_id = 1 -types = ['train', 'val'] +types = ["train", "val"] for t in types: - with open('%s.txt' % t, 'r') as f: - names = f.read().strip().split('\n') + with open("%s.txt" % t, "r") as f: + names = f.read().strip().split("\n") images = [] annotations = [] @@ -34,7 +34,7 @@ def mask2bbox(mask): img_name = img_name_fmt % name ann_path = os.path.join(inst_path, ann_name_fmt % name) - ann = scipy.io.loadmat(ann_path)['GTinst'][0][0] + ann = scipy.io.loadmat(ann_path)["GTinst"][0][0] classes = [int(x[0]) for x in ann[2]] seg = ann[0] @@ -43,46 +43,52 @@ def mask2bbox(mask): mask = (seg == (idx + 1)).astype(np.float) rle = pycocotools.mask.encode(np.asfortranarray(mask.astype(np.uint8))) - rle['counts'] = rle['counts'].decode('ascii') - - annotations.append({ - 'id': ann_id, - 'image_id': image_id, - 'category_id': classes[idx], - 'segmentation': rle, - 'area': float(mask.sum()), - 'bbox': [int(x) for x in mask2bbox(mask)], - 'iscrowd': 0 - }) + rle["counts"] = rle["counts"].decode("ascii") + + annotations.append( + { + "id": ann_id, + "image_id": image_id, + "category_id": classes[idx], + "segmentation": rle, + "area": float(mask.sum()), + "bbox": [int(x) for x in mask2bbox(mask)], + "iscrowd": 0, + } + ) ann_id += 1 - + img_name = img_name_fmt % name img = scipy.ndimage.imread(os.path.join(img_path, img_name)) - images.append({ - 'id': image_id, - 'width': img.shape[1], - 'height': img.shape[0], - 'file_name': img_name - }) + images.append( + { + "id": image_id, + "width": img.shape[1], + "height": img.shape[0], + "file_name": img_name, + } + ) image_id += 1 info = { - 'year': 2012, - 'version': 1, - 'description': 'Pascal SBD', + "year": 2012, + "version": 1, + "description": "Pascal SBD", } - categories = [{'id': x+1} for x in range(20)] - - with open('pascal_sbd_%s.json' % t, 'w') as f: - json.dump({ - 'info': info, - 'images': images, - 'annotations': annotations, - 'licenses': {}, - 'categories': categories - }, f) - + categories = [{"id": x + 1} for x in range(20)] + + with open("pascal_sbd_%s.json" % t, "w") as f: + json.dump( + { + "info": info, + "images": images, + "annotations": annotations, + "licenses": {}, + "categories": categories, + }, + f, + ) diff --git a/scripts/make_grid.py b/scripts/make_grid.py index 046329e18..33585eed6 100644 --- a/scripts/make_grid.py +++ b/scripts/make_grid.py @@ -9,7 +9,7 @@ plt.subplots_adjust(bottom=0.24) im_handle = None -save_path = 'grid.npy' +save_path = "grid.npy" center_x, center_y = (0.5, 0.5) grid_w, grid_h = (35, 35) @@ -24,181 +24,213 @@ # A hack disable_render = False -def render(): - if disable_render: - return - - x = np.tile(np.array(list(range(grid_w)), dtype=np.float).reshape(1, grid_w), [grid_h, 1]) - grid_w * center_x - y = np.tile(np.array(list(range(grid_h)), dtype=np.float).reshape(grid_h, 1), [1, grid_w]) - grid_h * center_y - - x /= scale - y /= scale - - a1 = angle + math.pi / 3 - a2 = -angle + math.pi / 3 - a3 = angle - - z1 = x * math.sin(a1) + y * math.cos(a1) - z2 = x * math.sin(a2) - y * math.cos(a2) - z3 = x * math.sin(a3) + y * math.cos(a3) - - s1 = np.square(np.sin(z1)) - s2 = np.square(np.sin(z2)) - s3 = np.square(np.sin(z3)) - line_1 = np.exp(s1 * spacing) * s1 - line_2 = np.exp(s2 * spacing) * s2 - line_3 = np.exp(s3 * spacing) * s3 - - global grid - grid = np.clip(1 - (line_1 + line_2 + line_3) / 3, 0, 1) +def render(): + if disable_render: + return + + x = ( + np.tile( + np.array(list(range(grid_w)), dtype=np.float).reshape(1, grid_w), + [grid_h, 1], + ) + - grid_w * center_x + ) + y = ( + np.tile( + np.array(list(range(grid_h)), dtype=np.float).reshape(grid_h, 1), + [1, grid_w], + ) + - grid_h * center_y + ) + + x /= scale + y /= scale + + a1 = angle + math.pi / 3 + a2 = -angle + math.pi / 3 + a3 = angle + + z1 = x * math.sin(a1) + y * math.cos(a1) + z2 = x * math.sin(a2) - y * math.cos(a2) + z3 = x * math.sin(a3) + y * math.cos(a3) + + s1 = np.square(np.sin(z1)) + s2 = np.square(np.sin(z2)) + s3 = np.square(np.sin(z3)) + + line_1 = np.exp(s1 * spacing) * s1 + line_2 = np.exp(s2 * spacing) * s2 + line_3 = np.exp(s3 * spacing) * s3 + + global grid + grid = np.clip(1 - (line_1 + line_2 + line_3) / 3, 0, 1) + + global im_handle + if im_handle is None: + im_handle = plt.imshow(grid) + else: + im_handle.set_data(grid) + fig.canvas.draw_idle() - global im_handle - if im_handle is None: - im_handle = plt.imshow(grid) - else: - im_handle.set_data(grid) - fig.canvas.draw_idle() def update_scale(val): - global scale - scale = val + global scale + scale = val + + render() - render() def update_angle(val): - global angle - angle = val + global angle + angle = val + + render() - render() def update_centerx(val): - global center_x - center_x = val + global center_x + center_x = val + + render() - render() def update_centery(val): - global center_y - center_y = val + global center_y + center_y = val + + render() - render() def update_spacing(val): - global spacing - spacing = val + global spacing + spacing = val + + render() - render() def randomize(val): - global center_x, center_y, spacing, scale, angle, disable_render + global center_x, center_y, spacing, scale, angle, disable_render - center_x, center_y = (random.uniform(0, 1), random.uniform(0, 1)) - spacing = random.uniform(-0.2, 2) - scale = 4 * math.exp(random.uniform(-1, 1)) - angle = random.uniform(-math.pi, math.pi) + center_x, center_y = (random.uniform(0, 1), random.uniform(0, 1)) + spacing = random.uniform(-0.2, 2) + scale = 4 * math.exp(random.uniform(-1, 1)) + angle = random.uniform(-math.pi, math.pi) - disable_render = True + disable_render = True - scale_slider.set_val(scale) - angle_slider.set_val(angle) - centx_slider.set_val(center_x) - centy_slider.set_val(center_y) - spaci_slider.set_val(spacing) + scale_slider.set_val(scale) + angle_slider.set_val(angle) + centx_slider.set_val(center_x) + centy_slider.set_val(center_y) + spaci_slider.set_val(spacing) - disable_render = False + disable_render = False - render() + render() -def add(val): - all_grids.append(grid) - global unique - if not unique: - unique = test_uniqueness(np.stack(all_grids)) - - export_len_text.set_text('Num Grids: ' + str(len(all_grids))) - fig.canvas.draw_idle() +def add(val): + all_grids.append(grid) -def add_randomize(val): - add(val) - randomize(val) + global unique + if not unique: + unique = test_uniqueness(np.stack(all_grids)) -def export(val): - np.save(save_path, np.stack(all_grids)) - print('Saved %d grids to "%s"' % (len(all_grids), save_path)) + export_len_text.set_text("Num Grids: " + str(len(all_grids))) + fig.canvas.draw_idle() - global unique - unique = False - all_grids.clear() - export_len_text.set_text('Num Grids: ' + str(len(all_grids))) - fig.canvas.draw_idle() +def add_randomize(val): + add(val) + randomize(val) -def test_uniqueness(grids): - # Grids shape [ngrids, h, w] - grids = grids.reshape((-1, grid_h, grid_w)) - for y in range(grid_h): - for x in range(grid_h): - pixel_features = grids[:, y, x] +def export(val): + np.save(save_path, np.stack(all_grids)) + print('Saved %d grids to "%s"' % (len(all_grids), save_path)) - # l1 distance for this pixel with every other - l1_dist = np.sum(np.abs(grids - np.tile(pixel_features, grid_h*grid_w).reshape((-1, grid_h, grid_w))), axis=0) + global unique + unique = False + all_grids.clear() - # Equal if l1 distance is really small. Note that this will include this pixel - num_equal = np.sum((l1_dist < 0.0001).astype(np.int32)) + export_len_text.set_text("Num Grids: " + str(len(all_grids))) + fig.canvas.draw_idle() - if num_equal > 1: - print('Pixel at (%d, %d) has %d other pixel%s with the same representation.' % (x, y, num_equal-1, '' if num_equal==2 else 's')) - return False - - print('Each pixel has a distinct representation.') - return True +def test_uniqueness(grids): + # Grids shape [ngrids, h, w] + grids = grids.reshape((-1, grid_h, grid_w)) + + for y in range(grid_h): + for x in range(grid_h): + pixel_features = grids[:, y, x] + + # l1 distance for this pixel with every other + l1_dist = np.sum( + np.abs( + grids + - np.tile(pixel_features, grid_h * grid_w).reshape( + (-1, grid_h, grid_w) + ) + ), + axis=0, + ) + + # Equal if l1 distance is really small. Note that this will include this pixel + num_equal = np.sum((l1_dist < 0.0001).astype(np.int32)) + + if num_equal > 1: + print( + "Pixel at (%d, %d) has %d other pixel%s with the same representation." + % (x, y, num_equal - 1, "" if num_equal == 2 else "s") + ) + return False + + print("Each pixel has a distinct representation.") + return True render() -axis = plt.axes([0.22, 0.19, 0.59, 0.03], facecolor='lightgoldenrodyellow') -scale_slider = Slider(axis, 'Scale', 0.1, 20, valinit=scale, valstep=0.1) +axis = plt.axes([0.22, 0.19, 0.59, 0.03], facecolor="lightgoldenrodyellow") +scale_slider = Slider(axis, "Scale", 0.1, 20, valinit=scale, valstep=0.1) scale_slider.on_changed(update_scale) -axis = plt.axes([0.22, 0.15, 0.59, 0.03], facecolor='lightgoldenrodyellow') -angle_slider = Slider(axis, 'Angle', -math.pi, math.pi, valinit=angle, valstep=0.1) +axis = plt.axes([0.22, 0.15, 0.59, 0.03], facecolor="lightgoldenrodyellow") +angle_slider = Slider(axis, "Angle", -math.pi, math.pi, valinit=angle, valstep=0.1) angle_slider.on_changed(update_angle) -axis = plt.axes([0.22, 0.11, 0.59, 0.03], facecolor='lightgoldenrodyellow') -centx_slider = Slider(axis, 'Center X', 0, 1, valinit=center_x, valstep=0.05) +axis = plt.axes([0.22, 0.11, 0.59, 0.03], facecolor="lightgoldenrodyellow") +centx_slider = Slider(axis, "Center X", 0, 1, valinit=center_x, valstep=0.05) centx_slider.on_changed(update_centerx) -axis = plt.axes([0.22, 0.07, 0.59, 0.03], facecolor='lightgoldenrodyellow') -centy_slider = Slider(axis, 'Center Y', 0, 1, valinit=center_y, valstep=0.05) +axis = plt.axes([0.22, 0.07, 0.59, 0.03], facecolor="lightgoldenrodyellow") +centy_slider = Slider(axis, "Center Y", 0, 1, valinit=center_y, valstep=0.05) centy_slider.on_changed(update_centery) -axis = plt.axes([0.22, 0.03, 0.59, 0.03], facecolor='lightgoldenrodyellow') -spaci_slider = Slider(axis, 'Spacing', -1, 2, valinit=spacing, valstep=0.05) +axis = plt.axes([0.22, 0.03, 0.59, 0.03], facecolor="lightgoldenrodyellow") +spaci_slider = Slider(axis, "Spacing", -1, 2, valinit=spacing, valstep=0.05) spaci_slider.on_changed(update_spacing) -axis = plt.axes([0.8, 0.54, 0.15, 0.05], facecolor='lightgoldenrodyellow') -rando_button = Button(axis, 'Randomize') +axis = plt.axes([0.8, 0.54, 0.15, 0.05], facecolor="lightgoldenrodyellow") +rando_button = Button(axis, "Randomize") rando_button.on_clicked(randomize) -axis = plt.axes([0.8, 0.48, 0.15, 0.05], facecolor='lightgoldenrodyellow') -addgr_button = Button(axis, 'Add') +axis = plt.axes([0.8, 0.48, 0.15, 0.05], facecolor="lightgoldenrodyellow") +addgr_button = Button(axis, "Add") addgr_button.on_clicked(add) # Likely not a good way to do this but whatever -export_len_text = plt.text(0, 3, 'Num Grids: 0') +export_len_text = plt.text(0, 3, "Num Grids: 0") -axis = plt.axes([0.8, 0.42, 0.15, 0.05], facecolor='lightgoldenrodyellow') -addra_button = Button(axis, 'Add / Rand') +axis = plt.axes([0.8, 0.42, 0.15, 0.05], facecolor="lightgoldenrodyellow") +addra_button = Button(axis, "Add / Rand") addra_button.on_clicked(add_randomize) -axis = plt.axes([0.8, 0.36, 0.15, 0.05], facecolor='lightgoldenrodyellow') -saveg_button = Button(axis, 'Save') +axis = plt.axes([0.8, 0.36, 0.15, 0.05], facecolor="lightgoldenrodyellow") +saveg_button = Button(axis, "Save") saveg_button.on_clicked(export) - plt.show() diff --git a/scripts/optimize_bboxes.py b/scripts/optimize_bboxes.py index 88e33b173..83466365a 100644 --- a/scripts/optimize_bboxes.py +++ b/scripts/optimize_bboxes.py @@ -15,8 +15,8 @@ import torch from scipy.optimize import minimize -dump_file = 'weights/bboxes.pkl' -aug_file = 'weights/bboxes_aug.pkl' +dump_file = "weights/bboxes.pkl" +aug_file = "weights/bboxes_aug.pkl" use_augmented_boxes = True @@ -34,10 +34,14 @@ def intersect(box_a, box_b): """ A = box_a.size(0) B = box_b.size(0) - max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), - box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) - min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), - box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + max_xy = torch.min( + box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2), + ) + min_xy = torch.max( + box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2), + ) inter = torch.clamp((max_xy - min_xy), min=0) return inter[:, :, 0] * inter[:, :, 1] @@ -55,10 +59,16 @@ def jaccard(box_a, box_b, iscrowd=False): jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] """ inter = intersect(box_a, box_b) - area_a = ((box_a[:, 2]-box_a[:, 0]) * - (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] - area_b = ((box_b[:, 2]-box_b[:, 0]) * - (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + area_a = ( + ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])) + .unsqueeze(1) + .expand_as(inter) + ) # [A,B] + area_b = ( + ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])) + .unsqueeze(0) + .expand_as(inter) + ) # [A,B] union = area_a + area_b - inter if iscrowd: @@ -66,9 +76,16 @@ def jaccard(box_a, box_b, iscrowd=False): else: return inter / union # [A,B] + # Also convert to point form def to_relative(bboxes): - return np.concatenate((bboxes[:, 2:4] / bboxes[:, :2], (bboxes[:, 2:4] + bboxes[:, 4:]) / bboxes[:, :2]), axis=1) + return np.concatenate( + ( + bboxes[:, 2:4] / bboxes[:, :2], + (bboxes[:, 2:4] + bboxes[:, 4:]) / bboxes[:, :2], + ), + axis=1, + ) def make_priors(conv_size, scales, aspect_ratios): @@ -80,20 +97,33 @@ def make_priors(conv_size, scales, aspect_ratios): for j, i in product(range(conv_h), range(conv_w)): x = (i + 0.5) / conv_w y = (j + 0.5) / conv_h - + for scale, ars in zip(scales, aspect_ratios): for ar in ars: w = scale * ar / conv_w h = scale / ar / conv_h # Point form - prior_data += [x - w/2, y - h/2, x + w/2, y + h/2] + prior_data += [x - w / 2, y - h / 2, x + w / 2, y + h / 2] return torch.Tensor(prior_data).view(-1, 4).cuda() - -scales = [[1.68, 2.91], [2.95, 2.22, 0.84], [2.17, 2.22, 3.22], [0.76, 2.06, 2.81], [5.33, 2.79], [13.69]] -aspect_ratios = [[[0.72, 0.96], [0.68, 1.17]], [[1.30, 0.66], [0.63, 1.23], [0.87, 1.41]], [[1.96, 1.23], [0.58, 0.84], [0.61, 1.15]], [[19.79, 2.21], [0.47, 1.76], [1.38, 0.79]], [[4.79, 17.96], [1.04]], [[14.82]]] +scales = [ + [1.68, 2.91], + [2.95, 2.22, 0.84], + [2.17, 2.22, 3.22], + [0.76, 2.06, 2.81], + [5.33, 2.79], + [13.69], +] +aspect_ratios = [ + [[0.72, 0.96], [0.68, 1.17]], + [[1.30, 0.66], [0.63, 1.23], [0.87, 1.41]], + [[1.96, 1.23], [0.58, 0.84], [0.61, 1.15]], + [[19.79, 2.21], [0.47, 1.76], [1.38, 0.79]], + [[4.79, 17.96], [1.04]], + [[14.82]], +] conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)] optimize_scales = False @@ -104,8 +134,9 @@ def make_priors(conv_size, scales, aspect_ratios): def compute_hits(bboxes, anchors, iou_threshold=0.5): ious = jaccard(bboxes, anchors) perGTAnchorMax, _ = torch.max(ious, dim=1) - - return (perGTAnchorMax > iou_threshold) + + return perGTAnchorMax > iou_threshold + def compute_recall(hits, base_hits): hits = (hits | base_hits).float() @@ -116,7 +147,9 @@ def step(x, x_func, bboxes, base_hits, optim_idx): # This should set the scale and aspect ratio x_func(x, scales[optim_idx], aspect_ratios[optim_idx]) - anchors = make_priors(conv_sizes[optim_idx], scales[optim_idx], aspect_ratios[optim_idx]) + anchors = make_priors( + conv_sizes[optim_idx], scales[optim_idx], aspect_ratios[optim_idx] + ) return -float(compute_recall(compute_hits(bboxes, anchors), base_hits).cpu()) @@ -125,7 +158,7 @@ def optimize(full_bboxes, optim_idx, batch_size=5000): global batch_idx, scales, aspect_ratios, conv_sizes start = batch_idx * batch_size - end = min((batch_idx + 1) * batch_size, full_bboxes.size(0)) + end = min((batch_idx + 1) * batch_size, full_bboxes.size(0)) if batch_idx > (full_bboxes.size(0) // batch_size): batch_idx = 0 @@ -134,10 +167,11 @@ def optimize(full_bboxes, optim_idx, batch_size=5000): anchor_base = [ make_priors(conv_sizes[idx], scales[idx], aspect_ratios[idx]) - for idx in range(len(conv_sizes)) if idx != optim_idx] + for idx in range(len(conv_sizes)) + if idx != optim_idx + ] base_hits = compute_hits(bboxes, torch.cat(anchor_base, dim=0)) - - + def set_x(x, scales, aspect_ratios): if optimize_scales: for i in range(len(scales)): @@ -148,29 +182,33 @@ def set_x(x, scales, aspect_ratios): for j in range(len(aspect_ratios[i])): aspect_ratios[i][j] = x[k] k += 1 - - res = minimize(step, x0=scales[optim_idx] if optimize_scales else sum(aspect_ratios[optim_idx], []), method='Powell', - args = (set_x, bboxes, base_hits, optim_idx),) + res = minimize( + step, + x0=scales[optim_idx] if optimize_scales else sum(aspect_ratios[optim_idx], []), + method="Powell", + args=(set_x, bboxes, base_hits, optim_idx), + ) -def pretty_str(x:list): +def pretty_str(x: list): if isinstance(x, list): - return '[' + ', '.join([pretty_str(y) for y in x]) + ']' + return "[" + ", ".join([pretty_str(y) for y in x]) + "]" elif isinstance(x, np.ndarray): return pretty_str(list(x)) else: - return '%.2f' % x + return "%.2f" % x + + +if __name__ == "__main__": -if __name__ == '__main__': - if use_augmented_boxes: - with open(aug_file, 'rb') as f: + with open(aug_file, "rb") as f: bboxes = pickle.load(f) else: # Load widths and heights from a dump file. Obtain this with # python3 scripts/save_bboxes.py - with open(dump_file, 'rb') as f: + with open(dump_file, "rb") as f: bboxes = pickle.load(f) bboxes = np.array(bboxes) @@ -178,27 +216,25 @@ def pretty_str(x:list): with torch.no_grad(): bboxes = torch.Tensor(bboxes).cuda() - + def print_out(): if optimize_scales: - print('Scales: ' + pretty_str(scales)) + print("Scales: " + pretty_str(scales)) else: - print('Aspect Ratios: ' + pretty_str(aspect_ratios)) + print("Aspect Ratios: " + pretty_str(aspect_ratios)) for p in range(10): - print('(Sub Iteration) ', end='') + print("(Sub Iteration) ", end="") for i in range(len(conv_sizes)): - print('%d ' % i, end='', flush=True) + print("%d " % i, end="", flush=True) optimize(bboxes, i) - print('Done', end='\r') - - print('(Iteration %d) ' % p, end='') + print("Done", end="\r") + + print("(Iteration %d) " % p, end="") print_out() print() optimize_scales = not optimize_scales - - print('scales = ' + pretty_str(scales)) - print('aspect_ratios = ' + pretty_str(aspect_ratios)) - + print("scales = " + pretty_str(scales)) + print("aspect_ratios = " + pretty_str(aspect_ratios)) diff --git a/scripts/parse_eval.py b/scripts/parse_eval.py index 153015655..ecb27ad71 100644 --- a/scripts/parse_eval.py +++ b/scripts/parse_eval.py @@ -2,48 +2,48 @@ import matplotlib.pyplot as plt from matplotlib._color_data import XKCD_COLORS -with open(sys.argv[1], 'r') as f: - txt = f.read() +with open(sys.argv[1], "r") as f: + txt = f.read() -txt, overall = txt.split('overall performance') +txt, overall = txt.split("overall performance") class_names = [] mAP_overall = [] -mAP_small = [] -mAP_medium = [] -mAP_large = [] - -for class_result in txt.split('evaluate category: ')[1:]: - lines = class_result.split('\n') - class_names.append(lines[0]) - - def grabMAP(string): - return float(string.split('] = ')[1]) * 100 - - mAP_overall.append(grabMAP(lines[ 7])) - mAP_small .append(grabMAP(lines[10])) - mAP_medium .append(grabMAP(lines[11])) - mAP_large .append(grabMAP(lines[12])) +mAP_small = [] +mAP_medium = [] +mAP_large = [] + +for class_result in txt.split("evaluate category: ")[1:]: + lines = class_result.split("\n") + class_names.append(lines[0]) + + def grabMAP(string): + return float(string.split("] = ")[1]) * 100 + + mAP_overall.append(grabMAP(lines[7])) + mAP_small.append(grabMAP(lines[10])) + mAP_medium.append(grabMAP(lines[11])) + mAP_large.append(grabMAP(lines[12])) mAP_map = { - 'small': mAP_small, - 'medium': mAP_medium, - 'large': mAP_large, + "small": mAP_small, + "medium": mAP_medium, + "large": mAP_large, } if len(sys.argv) > 2: - bars = plt.bar(class_names, mAP_map[sys.argv[2]]) - plt.title(sys.argv[2] + ' mAP per class') + bars = plt.bar(class_names, mAP_map[sys.argv[2]]) + plt.title(sys.argv[2] + " mAP per class") else: - bars = plt.bar(class_names, mAP_overall) - plt.title('overall mAP per class') + bars = plt.bar(class_names, mAP_overall) + plt.title("overall mAP per class") colors = list(XKCD_COLORS.values()) for idx, bar in enumerate(bars): - # Mmm pseudorandom colors - char_sum = sum([ord(char) for char in class_names[idx]]) - bar.set_color(colors[char_sum % len(colors)]) + # Mmm pseudorandom colors + char_sum = sum([ord(char) for char in class_names[idx]]) + bar.set_color(colors[char_sum % len(colors)]) -plt.xticks(rotation='vertical') +plt.xticks(rotation="vertical") plt.show() diff --git a/scripts/plot_loss.py b/scripts/plot_loss.py index 6ebb56dd9..df95bec61 100644 --- a/scripts/plot_loss.py +++ b/scripts/plot_loss.py @@ -3,76 +3,81 @@ from utils.functions import MovingAverage -with open(sys.argv[1], 'r') as f: - inp = f.read() +with open(sys.argv[1], "r") as f: + inp = f.read() patterns = { - 'train': re.compile(r'\[\s*(?P\d+)\]\s*(?P\d+) \|\| B: (?P\S+) \| C: (?P\S+) \| M: (?P\S+) \|( S: (?P\S+) \|)? T: (?P\S+)'), - 'val': re.compile(r'\s*(?P[a-z]+) \|\s*(?P\S+)') + "train": re.compile( + r"\[\s*(?P\d+)\]\s*(?P\d+) \|\| B: (?P\S+) \| C: (?P\S+) \| M: (?P\S+) \|( S: (?P\S+) \|)? T: (?P\S+)" + ), + "val": re.compile(r"\s*(?P[a-z]+) \|\s*(?P\S+)"), } data = {key: [] for key in patterns} -for line in inp.split('\n'): - for key, pattern in patterns.items(): - f = pattern.search(line) - - if f is not None: - datum = f.groupdict() - for k, v in datum.items(): - if v is not None: - try: - v = float(v) - except ValueError: - pass - datum[k] = v - - if key == 'val': - datum = (datum, data['train'][-1]) - data[key].append(datum) - break +for line in inp.split("\n"): + for key, pattern in patterns.items(): + f = pattern.search(line) + + if f is not None: + datum = f.groupdict() + for k, v in datum.items(): + if v is not None: + try: + v = float(v) + except ValueError: + pass + datum[k] = v + + if key == "val": + datum = (datum, data["train"][-1]) + data[key].append(datum) + break def smoother(y, interval=100): - avg = MovingAverage(interval) + avg = MovingAverage(interval) + + for i in range(len(y)): + avg.append(y[i]) + y[i] = avg.get_avg() + + return y - for i in range(len(y)): - avg.append(y[i]) - y[i] = avg.get_avg() - - return y def plot_train(data): - plt.title(os.path.basename(sys.argv[1]) + ' Training Loss') - plt.xlabel('Iteration') - plt.ylabel('Loss') + plt.title(os.path.basename(sys.argv[1]) + " Training Loss") + plt.xlabel("Iteration") + plt.ylabel("Loss") + + loss_names = ["BBox Loss", "Conf Loss", "Mask Loss"] - loss_names = ['BBox Loss', 'Conf Loss', 'Mask Loss'] + x = [x["iteration"] for x in data] + plt.plot(x, smoother([y["b"] for y in data])) + plt.plot(x, smoother([y["c"] for y in data])) + plt.plot(x, smoother([y["m"] for y in data])) - x = [x['iteration'] for x in data] - plt.plot(x, smoother([y['b'] for y in data])) - plt.plot(x, smoother([y['c'] for y in data])) - plt.plot(x, smoother([y['m'] for y in data])) + if data[0]["s"] is not None: + plt.plot(x, smoother([y["s"] for y in data])) + loss_names.append("Segmentation Loss") - if data[0]['s'] is not None: - plt.plot(x, smoother([y['s'] for y in data])) - loss_names.append('Segmentation Loss') + plt.legend(loss_names) + plt.show() - plt.legend(loss_names) - plt.show() def plot_val(data): - plt.title(os.path.basename(sys.argv[1]) + ' Validation mAP') - plt.xlabel('Epoch') - plt.ylabel('mAP') + plt.title(os.path.basename(sys.argv[1]) + " Validation mAP") + plt.xlabel("Epoch") + plt.ylabel("mAP") + + x = [x[1]["epoch"] for x in data if x[0]["type"] == "box"] + plt.plot(x, [x[0]["all"] for x in data if x[0]["type"] == "box"]) + plt.plot(x, [x[0]["all"] for x in data if x[0]["type"] == "mask"]) - x = [x[1]['epoch'] for x in data if x[0]['type'] == 'box'] - plt.plot(x, [x[0]['all'] for x in data if x[0]['type'] == 'box']) - plt.plot(x, [x[0]['all'] for x in data if x[0]['type'] == 'mask']) + plt.legend(["BBox mAP", "Mask mAP"]) + plt.show() - plt.legend(['BBox mAP', 'Mask mAP']) - plt.show() -if len(sys.argv) > 2 and sys.argv[2] == 'val': - plot_val(data['val']) +if len(sys.argv) > 2 and sys.argv[2] == "val": + plot_val(data["val"]) else: - plot_train(data['train']) + plot_train(data["train"]) diff --git a/scripts/save_bboxes.py b/scripts/save_bboxes.py index 63dedb4d5..8baa78c3a 100644 --- a/scripts/save_bboxes.py +++ b/scripts/save_bboxes.py @@ -7,27 +7,27 @@ import numpy as np -COCO_ROOT = osp.join('.', 'data/coco/') +COCO_ROOT = osp.join(".", "data/coco/") -annotation_file = 'instances_train2017.json' -annotation_path = osp.join(COCO_ROOT, 'annotations/', annotation_file) +annotation_file = "instances_train2017.json" +annotation_path = osp.join(COCO_ROOT, "annotations/", annotation_file) -dump_file = 'weights/bboxes.pkl' +dump_file = "weights/bboxes.pkl" -with open(annotation_path, 'r') as f: - annotations_json = json.load(f) +with open(annotation_path, "r") as f: + annotations_json = json.load(f) -annotations = annotations_json['annotations'] -images = annotations_json['images'] -images = {image['id']: image for image in images} +annotations = annotations_json["annotations"] +images = annotations_json["images"] +images = {image["id"]: image for image in images} bboxes = [] for ann in annotations: - image = images[ann['image_id']] - w,h = (image['width'], image['height']) - - if 'bbox' in ann: - bboxes.append([w, h] + ann['bbox']) - -with open(dump_file, 'wb') as f: - pickle.dump(bboxes, f) + image = images[ann["image_id"]] + w, h = (image["width"], image["height"]) + + if "bbox" in ann: + bboxes.append([w, h] + ann["bbox"]) + +with open(dump_file, "wb") as f: + pickle.dump(bboxes, f) diff --git a/scripts/unpack_statedict.py b/scripts/unpack_statedict.py index d26d03bc6..011e18eb0 100644 --- a/scripts/unpack_statedict.py +++ b/scripts/unpack_statedict.py @@ -5,12 +5,12 @@ # Make sure to include that slash after your out folder, since I can't # be arsed to do path concatenation so I'd rather type out this comment -print('Loading state dict...') +print("Loading state dict...") state = torch.load(sys.argv[1]) if not os.path.exists(sys.argv[2]): - os.mkdir(sys.argv[2]) + os.mkdir(sys.argv[2]) -print('Saving stuff...') +print("Saving stuff...") for key, val in state.items(): - torch.save(val, sys.argv[2] + key) + torch.save(val, sys.argv[2] + key) diff --git a/train.py b/train.py index 5f49fbc72..8a2e82a82 100644 --- a/train.py +++ b/train.py @@ -24,62 +24,134 @@ # Oof import eval as eval_script + def str2bool(v): return v.lower() in ("yes", "true", "t", "1") -parser = argparse.ArgumentParser( - description='Yolact Training Script') -parser.add_argument('--batch_size', default=8, type=int, - help='Batch size for training') -parser.add_argument('--resume', default=None, type=str, - help='Checkpoint state_dict file to resume training from. If this is "interrupt"'\ - ', the model will resume training from the interrupt file.') -parser.add_argument('--start_iter', default=-1, type=int, - help='Resume training at this iter. If this is -1, the iteration will be'\ - 'determined from the file name.') -parser.add_argument('--num_workers', default=4, type=int, - help='Number of workers used in dataloading') -parser.add_argument('--cuda', default=True, type=str2bool, - help='Use CUDA to train model') -parser.add_argument('--lr', '--learning_rate', default=None, type=float, - help='Initial learning rate. Leave as None to read this from the config.') -parser.add_argument('--momentum', default=None, type=float, - help='Momentum for SGD. Leave as None to read this from the config.') -parser.add_argument('--decay', '--weight_decay', default=None, type=float, - help='Weight decay for SGD. Leave as None to read this from the config.') -parser.add_argument('--gamma', default=None, type=float, - help='For each lr step, what to multiply the lr by. Leave as None to read this from the config.') -parser.add_argument('--save_folder', default='weights/', - help='Directory for saving checkpoint models.') -parser.add_argument('--log_folder', default='logs/', - help='Directory for saving logs.') -parser.add_argument('--config', default=None, - help='The config object to use.') -parser.add_argument('--save_interval', default=10000, type=int, - help='The number of iterations between saving the model.') -parser.add_argument('--validation_size', default=5000, type=int, - help='The number of images to use for validation.') -parser.add_argument('--validation_epoch', default=2, type=int, - help='Output validation information every n iterations. If -1, do no validation.') -parser.add_argument('--keep_latest', dest='keep_latest', action='store_true', - help='Only keep the latest checkpoint instead of each one.') -parser.add_argument('--keep_latest_interval', default=100000, type=int, - help='When --keep_latest is on, don\'t delete the latest file at these intervals. This should be a multiple of save_interval or 0.') -parser.add_argument('--dataset', default=None, type=str, - help='If specified, override the dataset specified in the config with this one (example: coco2017_dataset).') -parser.add_argument('--no_log', dest='log', action='store_false', - help='Don\'t log per iteration information into log_folder.') -parser.add_argument('--log_gpu', dest='log_gpu', action='store_true', - help='Include GPU information in the logs. Nvidia-smi tends to be slow, so set this with caution.') -parser.add_argument('--no_interrupt', dest='interrupt', action='store_false', - help='Don\'t save an interrupt when KeyboardInterrupt is caught.') -parser.add_argument('--batch_alloc', default=None, type=str, - help='If using multiple GPUS, you can set this to be a comma separated list detailing which GPUs should get what local batch size (It should add up to your total batch size).') -parser.add_argument('--no_autoscale', dest='autoscale', action='store_false', - help='YOLACT will automatically scale the lr and the number of iterations depending on the batch size. Set this if you want to disable that.') - -parser.set_defaults(keep_latest=False, log=True, log_gpu=False, interrupt=True, autoscale=True) +parser = argparse.ArgumentParser(description="Yolact Training Script") +parser.add_argument("--batch_size", default=8, type=int, help="Batch size for training") +parser.add_argument( + "--resume", + default=None, + type=str, + help='Checkpoint state_dict file to resume training from. If this is "interrupt"' + ", the model will resume training from the interrupt file.", +) +parser.add_argument( + "--start_iter", + default=-1, + type=int, + help="Resume training at this iter. If this is -1, the iteration will be" + "determined from the file name.", +) +parser.add_argument( + "--num_workers", default=4, type=int, help="Number of workers used in dataloading" +) +parser.add_argument( + "--cuda", default=True, type=str2bool, help="Use CUDA to train model" +) +parser.add_argument( + "--lr", + "--learning_rate", + default=None, + type=float, + help="Initial learning rate. Leave as None to read this from the config.", +) +parser.add_argument( + "--momentum", + default=None, + type=float, + help="Momentum for SGD. Leave as None to read this from the config.", +) +parser.add_argument( + "--decay", + "--weight_decay", + default=None, + type=float, + help="Weight decay for SGD. Leave as None to read this from the config.", +) +parser.add_argument( + "--gamma", + default=None, + type=float, + help="For each lr step, what to multiply the lr by. Leave as None to read this from the config.", +) +parser.add_argument( + "--save_folder", default="weights/", help="Directory for saving checkpoint models." +) +parser.add_argument("--log_folder", default="logs/", help="Directory for saving logs.") +parser.add_argument("--config", default=None, help="The config object to use.") +parser.add_argument( + "--save_interval", + default=10000, + type=int, + help="The number of iterations between saving the model.", +) +parser.add_argument( + "--validation_size", + default=5000, + type=int, + help="The number of images to use for validation.", +) +parser.add_argument( + "--validation_epoch", + default=2, + type=int, + help="Output validation information every n iterations. If -1, do no validation.", +) +parser.add_argument( + "--keep_latest", + dest="keep_latest", + action="store_true", + help="Only keep the latest checkpoint instead of each one.", +) +parser.add_argument( + "--keep_latest_interval", + default=100000, + type=int, + help="When --keep_latest is on, don't delete the latest file at these intervals. This should be a multiple of save_interval or 0.", +) +parser.add_argument( + "--dataset", + default=None, + type=str, + help="If specified, override the dataset specified in the config with this one (example: coco2017_dataset).", +) +parser.add_argument( + "--no_log", + dest="log", + action="store_false", + help="Don't log per iteration information into log_folder.", +) +parser.add_argument( + "--log_gpu", + dest="log_gpu", + action="store_true", + help="Include GPU information in the logs. Nvidia-smi tends to be slow, so set this with caution.", +) +parser.add_argument( + "--no_interrupt", + dest="interrupt", + action="store_false", + help="Don't save an interrupt when KeyboardInterrupt is caught.", +) +parser.add_argument( + "--batch_alloc", + default=None, + type=str, + help="If using multiple GPUS, you can set this to be a comma separated list detailing which GPUs should get what local batch size (It should add up to your total batch size).", +) +parser.add_argument( + "--no_autoscale", + dest="autoscale", + action="store_false", + help="YOLACT will automatically scale the lr and the number of iterations depending on the batch size. Set this if you want to disable that.", +) + +parser.set_defaults( + keep_latest=False, log=True, log_gpu=False, interrupt=True, autoscale=True +) args = parser.parse_args() if args.config is not None: @@ -90,8 +162,11 @@ def str2bool(v): if args.autoscale and args.batch_size != 8: factor = args.batch_size / 8 - if __name__ == '__main__': - print('Scaling parameters by %.2f to account for a batch size of %d.' % (factor, args.batch_size)) + if __name__ == "__main__": + print( + "Scaling parameters by %.2f to account for a batch size of %d." + % (factor, args.batch_size) + ) cfg.lr *= factor cfg.max_iter //= factor @@ -99,53 +174,62 @@ def str2bool(v): # Update training parameters from the config if necessary def replace(name): - if getattr(args, name) == None: setattr(args, name, getattr(cfg, name)) -replace('lr') -replace('decay') -replace('gamma') -replace('momentum') + if getattr(args, name) == None: + setattr(args, name, getattr(cfg, name)) + + +replace("lr") +replace("decay") +replace("gamma") +replace("momentum") # This is managed by set_lr cur_lr = args.lr if torch.cuda.device_count() == 0: - print('No GPUs detected. Exiting...') + print("No GPUs detected. Exiting...") exit(-1) if args.batch_size // torch.cuda.device_count() < 6: - if __name__ == '__main__': - print('Per-GPU batch size is less than the recommended limit for batch norm. Disabling batch norm.') + if __name__ == "__main__": + print( + "Per-GPU batch size is less than the recommended limit for batch norm. Disabling batch norm." + ) cfg.freeze_bn = True -loss_types = ['B', 'C', 'M', 'P', 'D', 'E', 'S', 'I'] +loss_types = ["B", "C", "M", "P", "D", "E", "S", "I"] if torch.cuda.is_available(): if args.cuda: - torch.set_default_tensor_type('torch.cuda.FloatTensor') + torch.set_default_tensor_type("torch.cuda.FloatTensor") if not args.cuda: - print("WARNING: It looks like you have a CUDA device, but aren't " + - "using CUDA.\nRun with --cuda for optimal training speed.") - torch.set_default_tensor_type('torch.FloatTensor') + print( + "WARNING: It looks like you have a CUDA device, but aren't " + + "using CUDA.\nRun with --cuda for optimal training speed." + ) + torch.set_default_tensor_type("torch.FloatTensor") else: - torch.set_default_tensor_type('torch.FloatTensor') + torch.set_default_tensor_type("torch.FloatTensor") + class NetLoss(nn.Module): """ A wrapper for running the network and computing the loss This is so we can more efficiently use DataParallel. """ - - def __init__(self, net:Yolact, criterion:MultiBoxLoss): + + def __init__(self, net: Yolact, criterion: MultiBoxLoss): super().__init__() self.net = net self.criterion = criterion - + def forward(self, images, targets, masks, num_crowds): preds = self.net(images) losses = self.criterion(self.net, preds, targets, masks, num_crowds) return losses + class CustomDataParallel(nn.DataParallel): """ This is a custom version of DataParallel that works better with our training data. @@ -155,33 +239,43 @@ class CustomDataParallel(nn.DataParallel): def scatter(self, inputs, kwargs, device_ids): # More like scatter and data prep at the same time. The point is we prep the data in such a way # that no scatter is necessary, and there's no need to shuffle stuff around different GPUs. - devices = ['cuda:' + str(x) for x in device_ids] + devices = ["cuda:" + str(x) for x in device_ids] splits = prepare_data(inputs[0], devices, allocation=args.batch_alloc) - return [[split[device_idx] for split in splits] for device_idx in range(len(devices))], \ - [kwargs] * len(devices) + return ( + [ + [split[device_idx] for split in splits] + for device_idx in range(len(devices)) + ], + [kwargs] * len(devices), + ) def gather(self, outputs, output_device): out = {} for k in outputs[0]: out[k] = torch.stack([output[k].to(output_device) for output in outputs]) - + return out + def train(): if not os.path.exists(args.save_folder): os.mkdir(args.save_folder) - dataset = COCODetection(image_path=cfg.dataset.train_images, - info_file=cfg.dataset.train_info, - transform=SSDAugmentation(MEANS)) - + dataset = COCODetection( + image_path=cfg.dataset.train_images, + info_file=cfg.dataset.train_info, + transform=SSDAugmentation(MEANS), + ) + if args.validation_epoch > 0: setup_eval() - val_dataset = COCODetection(image_path=cfg.dataset.valid_images, - info_file=cfg.dataset.valid_info, - transform=BaseTransform(MEANS)) + val_dataset = COCODetection( + image_path=cfg.dataset.valid_images, + info_file=cfg.dataset.valid_info, + transform=BaseTransform(MEANS), + ) # Parallel wraps the underlying module, but when saving and loading we don't want that yolact_net = Yolact() @@ -189,50 +283,63 @@ def train(): net.train() if args.log: - log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()), - overwrite=(args.resume is None), log_gpu_stats=args.log_gpu) + log = Log( + cfg.name, + args.log_folder, + dict(args._get_kwargs()), + overwrite=(args.resume is None), + log_gpu_stats=args.log_gpu, + ) # I don't use the timer during training (I use a different timing method). # Apparently there's a race condition with multiple GPUs, so disable it just to be safe. timer.disable_all() - # Both of these can set args.resume to None, so do them before the check - if args.resume == 'interrupt': + # Both of these can set args.resume to None, so do them before the check + if args.resume == "interrupt": args.resume = SavePath.get_interrupt(args.save_folder) - elif args.resume == 'latest': + elif args.resume == "latest": args.resume = SavePath.get_latest(args.save_folder, cfg.name) if args.resume is not None: - print('Resuming training, loading {}...'.format(args.resume)) + print("Resuming training, loading {}...".format(args.resume)) yolact_net.load_weights(args.resume) if args.start_iter == -1: args.start_iter = SavePath.from_str(args.resume).iteration else: - print('Initializing weights...') + print("Initializing weights...") yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path) - optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, - weight_decay=args.decay) - criterion = MultiBoxLoss(num_classes=cfg.num_classes, - pos_threshold=cfg.positive_iou_threshold, - neg_threshold=cfg.negative_iou_threshold, - negpos_ratio=cfg.ohem_negpos_ratio) + optimizer = optim.SGD( + net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.decay + ) + criterion = MultiBoxLoss( + num_classes=cfg.num_classes, + pos_threshold=cfg.positive_iou_threshold, + neg_threshold=cfg.negative_iou_threshold, + negpos_ratio=cfg.ohem_negpos_ratio, + ) if args.batch_alloc is not None: - args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')] + args.batch_alloc = [int(x) for x in args.batch_alloc.split(",")] if sum(args.batch_alloc) != args.batch_size: - print('Error: Batch allocation (%s) does not sum to batch size (%s).' % (args.batch_alloc, args.batch_size)) + print( + "Error: Batch allocation (%s) does not sum to batch size (%s)." + % (args.batch_alloc, args.batch_size) + ) exit(-1) net = CustomDataParallel(NetLoss(net, criterion)) if args.cuda: net = net.cuda() - + # Initialize everything - if not cfg.freeze_bn: yolact_net.freeze_bn() # Freeze bn so we don't kill our means + if not cfg.freeze_bn: + yolact_net.freeze_bn() # Freeze bn so we don't kill our means yolact_net(torch.zeros(1, 3, cfg.max_size, cfg.max_size).cuda()) - if not cfg.freeze_bn: yolact_net.freeze_bn(True) + if not cfg.freeze_bn: + yolact_net.freeze_bn(True) # loss counters loc_loss = 0 @@ -242,34 +349,39 @@ def train(): epoch_size = len(dataset) // args.batch_size num_epochs = math.ceil(cfg.max_iter / epoch_size) - + # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index step_index = 0 - data_loader = data.DataLoader(dataset, args.batch_size, - num_workers=args.num_workers, - shuffle=True, collate_fn=detection_collate, - pin_memory=True) - - - save_path = lambda epoch, iteration: SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder) + data_loader = data.DataLoader( + dataset, + args.batch_size, + num_workers=args.num_workers, + shuffle=True, + collate_fn=detection_collate, + pin_memory=True, + ) + + save_path = lambda epoch, iteration: SavePath(cfg.name, epoch, iteration).get_path( + root=args.save_folder + ) time_avg = MovingAverage() - global loss_types # Forms the print order - loss_avgs = { k: MovingAverage(100) for k in loss_types } + global loss_types # Forms the print order + loss_avgs = {k: MovingAverage(100) for k in loss_types} - print('Begin training!') + print("Begin training!") print() # try-except so you can use ctrl+c to save early and stop training try: for epoch in range(num_epochs): # Resume from start_iter - if (epoch+1)*epoch_size < iteration: + if (epoch + 1) * epoch_size < iteration: continue - + for datum in data_loader: # Stop if we've reached an epoch if we're resuming from start_iter - if iteration == (epoch+1)*epoch_size: + if iteration == (epoch + 1) * epoch_size: break # Stop at the configured number of iterations even if mid-epoch @@ -286,43 +398,55 @@ def train(): # Reset the loss averages because things might have changed for avg in loss_avgs: avg.reset() - + # If a config setting was changed, remove it from the list so we don't keep checking if changed: - cfg.delayed_settings = [x for x in cfg.delayed_settings if x[0] > iteration] + cfg.delayed_settings = [ + x for x in cfg.delayed_settings if x[0] > iteration + ] # Warm up by linearly interpolating the learning rate from some smaller value if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until: - set_lr(optimizer, (args.lr - cfg.lr_warmup_init) * (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init) + set_lr( + optimizer, + (args.lr - cfg.lr_warmup_init) + * (iteration / cfg.lr_warmup_until) + + cfg.lr_warmup_init, + ) # Adjust the learning rate at the given iterations, but also if we resume from past that iteration - while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]: + while ( + step_index < len(cfg.lr_steps) + and iteration >= cfg.lr_steps[step_index] + ): step_index += 1 set_lr(optimizer, args.lr * (args.gamma ** step_index)) - + # Zero the grad to get ready to compute gradients optimizer.zero_grad() # Forward Pass + Compute loss at the same time (see CustomDataParallel and NetLoss) losses = net(datum) - - losses = { k: (v).mean() for k,v in losses.items() } # Mean here because Dataparallel + + losses = { + k: (v).mean() for k, v in losses.items() + } # Mean here because Dataparallel loss = sum([losses[k] for k in losses]) - + # no_inf_mean removes some components from the loss, so make sure to backward through all of it # all_loss = sum([v.mean() for v in losses.values()]) # Backprop - loss.backward() # Do this to free up vram even if loss is not finite + loss.backward() # Do this to free up vram even if loss is not finite if torch.isfinite(loss).item(): optimizer.step() - + # Add the loss to the moving average for bookkeeping for k in losses: loss_avgs[k].add(losses[k].item()) - cur_time = time.time() - elapsed = cur_time - last_time + cur_time = time.time() + elapsed = cur_time - last_time last_time = cur_time # Exclude graph setup from the timing information @@ -330,56 +454,94 @@ def train(): time_avg.add(elapsed) if iteration % 10 == 0: - eta_str = str(datetime.timedelta(seconds=(cfg.max_iter-iteration) * time_avg.get_avg())).split('.')[0] - + eta_str = str( + datetime.timedelta( + seconds=(cfg.max_iter - iteration) * time_avg.get_avg() + ) + ).split(".")[0] + total = sum([loss_avgs[k].get_avg() for k in losses]) - loss_labels = sum([[k, loss_avgs[k].get_avg()] for k in loss_types if k in losses], []) - - print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f') - % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]), flush=True) + loss_labels = sum( + [ + [k, loss_avgs[k].get_avg()] + for k in loss_types + if k in losses + ], + [], + ) + + print( + ( + "[%3d] %7d ||" + + (" %s: %.3f |" * len(losses)) + + " T: %.3f || ETA: %s || timer: %.3f" + ) + % tuple( + [epoch, iteration] + loss_labels + [total, eta_str, elapsed] + ), + flush=True, + ) if args.log: precision = 5 loss_info = {k: round(losses[k].item(), precision) for k in losses} - loss_info['T'] = round(loss.item(), precision) + loss_info["T"] = round(loss.item(), precision) if args.log_gpu: - log.log_gpu_stats = (iteration % 10 == 0) # nvidia-smi is sloooow - - log.log('train', loss=loss_info, epoch=epoch, iter=iteration, - lr=round(cur_lr, 10), elapsed=elapsed) + log.log_gpu_stats = iteration % 10 == 0 # nvidia-smi is sloooow + + log.log( + "train", + loss=loss_info, + epoch=epoch, + iter=iteration, + lr=round(cur_lr, 10), + elapsed=elapsed, + ) log.log_gpu_stats = args.log_gpu - + iteration += 1 if iteration % args.save_interval == 0 and iteration != args.start_iter: if args.keep_latest: latest = SavePath.get_latest(args.save_folder, cfg.name) - print('Saving state, iter:', iteration) + print("Saving state, iter:", iteration) yolact_net.save_weights(save_path(epoch, iteration)) if args.keep_latest and latest is not None: - if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval: - print('Deleting old save...') + if ( + args.keep_latest_interval <= 0 + or iteration % args.keep_latest_interval + != args.save_interval + ): + print("Deleting old save...") os.remove(latest) - + # This is done per epoch if args.validation_epoch > 0: if epoch % args.validation_epoch == 0 and epoch > 0: - compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None) - + compute_validation_map( + epoch, + iteration, + yolact_net, + val_dataset, + log if args.log else None, + ) + # Compute validation mAP after training is finished - compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None) + compute_validation_map( + epoch, iteration, yolact_net, val_dataset, log if args.log else None + ) except KeyboardInterrupt: if args.interrupt: - print('Stopping early. Saving network...') - + print("Stopping early. Saving network...") + # Delete previous copy of the interrupted network so we don't spam the weights folder SavePath.remove_interrupt(args.save_folder) - - yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt')) + + yolact_net.save_weights(save_path(epoch, repr(iteration) + "_interrupt")) exit() yolact_net.save_weights(save_path(epoch, iteration)) @@ -387,56 +549,67 @@ def train(): def set_lr(optimizer, new_lr): for param_group in optimizer.param_groups: - param_group['lr'] = new_lr - + param_group["lr"] = new_lr + global cur_lr cur_lr = new_lr + def gradinator(x): x.requires_grad = False return x -def prepare_data(datum, devices:list=None, allocation:list=None): + +def prepare_data(datum, devices: list = None, allocation: list = None): with torch.no_grad(): if devices is None: - devices = ['cuda:0'] if args.cuda else ['cpu'] + devices = ["cuda:0"] if args.cuda else ["cpu"] if allocation is None: allocation = [args.batch_size // len(devices)] * (len(devices) - 1) - allocation.append(args.batch_size - sum(allocation)) # The rest might need more/less - + allocation.append( + args.batch_size - sum(allocation) + ) # The rest might need more/less + images, (targets, masks, num_crowds) = datum cur_idx = 0 for device, alloc in zip(devices, allocation): for _ in range(alloc): - images[cur_idx] = gradinator(images[cur_idx].to(device)) + images[cur_idx] = gradinator(images[cur_idx].to(device)) targets[cur_idx] = gradinator(targets[cur_idx].to(device)) - masks[cur_idx] = gradinator(masks[cur_idx].to(device)) + masks[cur_idx] = gradinator(masks[cur_idx].to(device)) cur_idx += 1 if cfg.preserve_aspect_ratio: # Choose a random size from the batch - _, h, w = images[random.randint(0, len(images)-1)].size() + _, h, w = images[random.randint(0, len(images) - 1)].size() + + for idx, (image, target, mask, num_crowd) in enumerate( + zip(images, targets, masks, num_crowds) + ): + images[idx], targets[idx], masks[idx], num_crowds[idx] = enforce_size( + image, target, mask, num_crowd, w, h + ) - for idx, (image, target, mask, num_crowd) in enumerate(zip(images, targets, masks, num_crowds)): - images[idx], targets[idx], masks[idx], num_crowds[idx] \ - = enforce_size(image, target, mask, num_crowd, w, h) - cur_idx = 0 - split_images, split_targets, split_masks, split_numcrowds \ - = [[None for alloc in allocation] for _ in range(4)] + split_images, split_targets, split_masks, split_numcrowds = [ + [None for alloc in allocation] for _ in range(4) + ] for device_idx, alloc in enumerate(allocation): - split_images[device_idx] = torch.stack(images[cur_idx:cur_idx+alloc], dim=0) - split_targets[device_idx] = targets[cur_idx:cur_idx+alloc] - split_masks[device_idx] = masks[cur_idx:cur_idx+alloc] - split_numcrowds[device_idx] = num_crowds[cur_idx:cur_idx+alloc] + split_images[device_idx] = torch.stack( + images[cur_idx : cur_idx + alloc], dim=0 + ) + split_targets[device_idx] = targets[cur_idx : cur_idx + alloc] + split_masks[device_idx] = masks[cur_idx : cur_idx + alloc] + split_numcrowds[device_idx] = num_crowds[cur_idx : cur_idx + alloc] cur_idx += alloc return split_images, split_targets, split_masks, split_numcrowds -def no_inf_mean(x:torch.Tensor): + +def no_inf_mean(x: torch.Tensor): """ Computes the mean of a vector, throwing out all inf values. If there are no non-inf values, this will return inf (i.e., just the normal mean). @@ -449,12 +622,13 @@ def no_inf_mean(x:torch.Tensor): else: return x.mean() + def compute_validation_loss(net, data_loader, criterion): global loss_types with torch.no_grad(): losses = {} - + # Don't switch to eval mode because we want to get losses iterations = 0 for datum in data_loader: @@ -463,7 +637,7 @@ def compute_validation_loss(net, data_loader, criterion): wrapper = ScatterWrapper(targets, masks, num_crowds) _losses = criterion(out, wrapper, wrapper.make_mask()) - + for k, v in _losses.items(): v = v.mean().item() if k in losses: @@ -474,18 +648,22 @@ def compute_validation_loss(net, data_loader, criterion): iterations += 1 if args.validation_size <= iterations * args.batch_size: break - + for k in losses: losses[k] /= iterations - - + loss_labels = sum([[k, losses[k]] for k in loss_types if k in losses], []) - print(('Validation ||' + (' %s: %.3f |' * len(losses)) + ')') % tuple(loss_labels), flush=True) + print( + ("Validation ||" + (" %s: %.3f |" * len(losses)) + ")") + % tuple(loss_labels), + flush=True, + ) + -def compute_validation_map(epoch, iteration, yolact_net, dataset, log:Log=None): +def compute_validation_map(epoch, iteration, yolact_net, dataset, log: Log = None): with torch.no_grad(): yolact_net.eval() - + start = time.time() print() print("Computing validation mAP (this may take a while)...", flush=True) @@ -493,12 +671,14 @@ def compute_validation_map(epoch, iteration, yolact_net, dataset, log:Log=None): end = time.time() if log is not None: - log.log('val', val_info, elapsed=(end - start), epoch=epoch, iter=iteration) + log.log("val", val_info, elapsed=(end - start), epoch=epoch, iter=iteration) yolact_net.train() + def setup_eval(): - eval_script.parse_args(['--no_bar', '--max_images='+str(args.validation_size)]) + eval_script.parse_args(["--no_bar", "--max_images=" + str(args.validation_size)]) + -if __name__ == '__main__': +if __name__ == "__main__": train() diff --git a/utils/__init__.py b/utils/__init__.py index a7ecb871a..abead4ba0 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1 +1 @@ -from .augmentations import SSDAugmentation \ No newline at end of file +from .augmentations import SSDAugmentation diff --git a/utils/augmentations.py b/utils/augmentations.py index cc7a73aa3..3fa193d4e 100644 --- a/utils/augmentations.py +++ b/utils/augmentations.py @@ -28,10 +28,8 @@ def jaccard_numpy(box_a, box_b): jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] """ inter = intersect(box_a, box_b) - area_a = ((box_a[:, 2]-box_a[:, 0]) * - (box_a[:, 3]-box_a[:, 1])) # [A,B] - area_b = ((box_b[2]-box_b[0]) * - (box_b[3]-box_b[1])) # [A,B] + area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]) # [A,B] + area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) # [A,B] union = area_a + area_b - inter return inter / union # [A,B] @@ -72,7 +70,6 @@ def __call__(self, image, masks=None, boxes=None, labels=None): return image.astype(np.float32), masks, boxes, labels - class ToAbsoluteCoords(object): def __call__(self, image, masks=None, boxes=None, labels=None): height, width, channels = image.shape @@ -102,6 +99,7 @@ class Pad(object): Note: this expects im_w <= width and im_h <= height """ + def __init__(self, width, height, mean=MEANS, pad_gt=True): self.mean = mean self.width = width @@ -111,21 +109,20 @@ def __init__(self, width, height, mean=MEANS, pad_gt=True): def __call__(self, image, masks, boxes=None, labels=None): im_h, im_w, depth = image.shape - expand_image = np.zeros( - (self.height, self.width, depth), - dtype=image.dtype) + expand_image = np.zeros((self.height, self.width, depth), dtype=image.dtype) expand_image[:, :, :] = self.mean expand_image[:im_h, :im_w] = image if self.pad_gt: expand_masks = np.zeros( - (masks.shape[0], self.height, self.width), - dtype=masks.dtype) - expand_masks[:,:im_h,:im_w] = masks + (masks.shape[0], self.height, self.width), dtype=masks.dtype + ) + expand_masks[:, :im_h, :im_w] = masks masks = expand_masks return expand_image, masks, boxes, labels + class Resize(object): """ If preserve_aspect_ratio is true, this resizes to an approximate area of max_size * max_size """ @@ -144,7 +141,7 @@ def __init__(self, resize_gt=True): def __call__(self, image, masks, boxes, labels=None): img_h, img_w, _ = image.shape - + if self.preserve_aspect_ratio: width, height = Resize.calc_size_preserve_ar(img_w, img_h, self.max_size) else: @@ -156,7 +153,7 @@ def __call__(self, image, masks, boxes, labels=None): # Act like each object is a color channel masks = masks.transpose((1, 2, 0)) masks = cv2.resize(masks, (width, height)) - + # OpenCV resizes a (w,h,1) array to (s,s), so fix that if len(masks.shape) == 2: masks = np.expand_dims(masks, 0) @@ -164,8 +161,8 @@ def __call__(self, image, masks, boxes, labels=None): masks = masks.transpose((2, 0, 1)) # Scale bounding boxes (which are currently absolute coordinates) - boxes[:, [0, 2]] *= (width / img_w) - boxes[:, [1, 3]] *= (height / img_h) + boxes[:, [0, 2]] *= width / img_w + boxes[:, [1, 3]] *= height / img_h # Discard boxes that are smaller than we'd like w = boxes[:, 2] - boxes[:, 0] @@ -174,8 +171,8 @@ def __call__(self, image, masks, boxes, labels=None): keep = (w > cfg.discard_box_width) * (h > cfg.discard_box_height) masks = masks[keep] boxes = boxes[keep] - labels['labels'] = labels['labels'][keep] - labels['num_crowds'] = (labels['labels'] < 0).sum() + labels["labels"] = labels["labels"][keep] + labels["num_crowds"] = (labels["labels"] < 0).sum() return image, masks, boxes, labels @@ -209,9 +206,7 @@ def __call__(self, image, masks=None, boxes=None, labels=None): class RandomLightingNoise(object): def __init__(self): - self.perms = ((0, 1, 2), (0, 2, 1), - (1, 0, 2), (1, 2, 0), - (2, 0, 1), (2, 1, 0)) + self.perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)) def __call__(self, image, masks=None, boxes=None, labels=None): # Don't shuffle the channels please, why would you do this @@ -224,14 +219,14 @@ def __call__(self, image, masks=None, boxes=None, labels=None): class ConvertColor(object): - def __init__(self, current='BGR', transform='HSV'): + def __init__(self, current="BGR", transform="HSV"): self.transform = transform self.current = current def __call__(self, image, masks=None, boxes=None, labels=None): - if self.current == 'BGR' and self.transform == 'HSV': + if self.current == "BGR" and self.transform == "HSV": image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) - elif self.current == 'HSV' and self.transform == 'BGR': + elif self.current == "HSV" and self.transform == "BGR": image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) else: raise NotImplementedError @@ -268,12 +263,22 @@ def __call__(self, image, masks=None, boxes=None, labels=None): class ToCV2Image(object): def __call__(self, tensor, masks=None, boxes=None, labels=None): - return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), masks, boxes, labels + return ( + tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), + masks, + boxes, + labels, + ) class ToTensor(object): def __call__(self, cvimage, masks=None, boxes=None, labels=None): - return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), masks, boxes, labels + return ( + torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), + masks, + boxes, + labels, + ) class RandomSampleCrop(object): @@ -289,6 +294,7 @@ class RandomSampleCrop(object): boxes (Tensor): the adjusted bounding boxes in pt form labels (Tensor): the class labels for each bbox """ + def __init__(self): self.sample_options = ( # using entire original input image @@ -312,9 +318,9 @@ def __call__(self, image, masks, boxes=None, labels=None): min_iou, max_iou = mode if min_iou is None: - min_iou = float('-inf') + min_iou = float("-inf") if max_iou is None: - max_iou = float('inf') + max_iou = float("inf") # max trails (50) for _ in range(50): @@ -331,7 +337,7 @@ def __call__(self, image, masks, boxes=None, labels=None): top = random.uniform(height - h) # convert to integer rect x1,y1,x2,y2 - rect = np.array([int(left), int(top), int(left+w), int(top+h)]) + rect = np.array([int(left), int(top), int(left + w), int(top + h)]) # calculate IoU (jaccard overlap) b/t the cropped and gt boxes overlap = jaccard_numpy(boxes, rect) @@ -347,8 +353,7 @@ def __call__(self, image, masks, boxes=None, labels=None): continue # cut the crop from the image - current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], - :] + current_image = current_image[rect[1] : rect[3], rect[0] : rect[2], :] # keep overlap with gt box IF center in sampled patch centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 @@ -363,7 +368,7 @@ def __call__(self, image, masks, boxes=None, labels=None): mask = m1 * m2 # [0 ... 0 for num_gt and then 1 ... 1 for num_crowds] - num_crowds = labels['num_crowds'] + num_crowds = labels["num_crowds"] crowd_mask = np.zeros(mask.shape, dtype=np.int32) if num_crowds > 0: @@ -371,7 +376,7 @@ def __call__(self, image, masks, boxes=None, labels=None): # have any valid boxes? try again if not # Also make sure you have at least one regular gt - if not mask.any() or np.sum(1-crowd_mask[mask]) == 0: + if not mask.any() or np.sum(1 - crowd_mask[mask]) == 0: continue # take only the matching gt masks @@ -381,26 +386,24 @@ def __call__(self, image, masks, boxes=None, labels=None): current_boxes = boxes[mask, :].copy() # take only matching gt labels - labels['labels'] = labels['labels'][mask] + labels["labels"] = labels["labels"][mask] current_labels = labels # We now might have fewer crowd annotations if num_crowds > 0: - labels['num_crowds'] = np.sum(crowd_mask[mask]) + labels["num_crowds"] = np.sum(crowd_mask[mask]) # should we use the box left and top corner or the crop's - current_boxes[:, :2] = np.maximum(current_boxes[:, :2], - rect[:2]) + current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2]) # adjust to crop (by substracting crop's left,top) current_boxes[:, :2] -= rect[:2] - current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], - rect[2:]) + current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:]) # adjust to crop (by substracting crop's left,top) current_boxes[:, 2:] -= rect[:2] # crop the current masks to the same dimensions as the image - current_masks = current_masks[:, rect[1]:rect[3], rect[0]:rect[2]] + current_masks = current_masks[:, rect[1] : rect[3], rect[0] : rect[2]] return current_image, current_masks, current_boxes, current_labels @@ -415,22 +418,24 @@ def __call__(self, image, masks, boxes, labels): height, width, depth = image.shape ratio = random.uniform(1, 4) - left = random.uniform(0, width*ratio - width) - top = random.uniform(0, height*ratio - height) + left = random.uniform(0, width * ratio - width) + top = random.uniform(0, height * ratio - height) expand_image = np.zeros( - (int(height*ratio), int(width*ratio), depth), - dtype=image.dtype) + (int(height * ratio), int(width * ratio), depth), dtype=image.dtype + ) expand_image[:, :, :] = self.mean - expand_image[int(top):int(top + height), - int(left):int(left + width)] = image + expand_image[ + int(top) : int(top + height), int(left) : int(left + width) + ] = image image = expand_image expand_masks = np.zeros( - (masks.shape[0], int(height*ratio), int(width*ratio)), - dtype=masks.dtype) - expand_masks[:,int(top):int(top + height), - int(left):int(left + width)] = masks + (masks.shape[0], int(height * ratio), int(width * ratio)), dtype=masks.dtype + ) + expand_masks[ + :, int(top) : int(top + height), int(left) : int(left + width) + ] = masks masks = expand_masks boxes = boxes.copy() @@ -453,7 +458,7 @@ def __call__(self, image, masks, boxes, labels): class RandomFlip(object): def __call__(self, image, masks, boxes, labels): - height , _ , _ = image.shape + height, _, _ = image.shape if random.randint(2): image = image[::-1, :] masks = masks[:, ::-1, :] @@ -464,13 +469,18 @@ def __call__(self, image, masks, boxes, labels): class RandomRot90(object): def __call__(self, image, masks, boxes, labels): - old_height , old_width , _ = image.shape + old_height, old_width, _ = image.shape k = random.randint(4) - image = np.rot90(image,k) - masks = np.array([np.rot90(mask,k) for mask in masks]) + image = np.rot90(image, k) + masks = np.array([np.rot90(mask, k) for mask in masks]) boxes = boxes.copy() for _ in range(k): - boxes = np.array([[box[1], old_width - 1 - box[2], box[3], old_width - 1 - box[0]] for box in boxes]) + boxes = np.array( + [ + [box[1], old_width - 1 - box[2], box[3], old_width - 1 - box[0]] + for box in boxes + ] + ) old_width, old_height = old_height, old_width return image, masks, boxes, labels @@ -505,11 +515,11 @@ class PhotometricDistort(object): def __init__(self): self.pd = [ RandomContrast(), - ConvertColor(transform='HSV'), + ConvertColor(transform="HSV"), RandomSaturation(), RandomHue(), - ConvertColor(current='HSV', transform='BGR'), - RandomContrast() + ConvertColor(current="HSV", transform="BGR"), + RandomContrast(), ] self.rand_brightness = RandomBrightness() self.rand_light_noise = RandomLightingNoise() @@ -524,6 +534,7 @@ def __call__(self, image, masks, boxes, labels): im, masks, boxes, labels = distort(im, masks, boxes, labels) return self.rand_light_noise(im, masks, boxes, labels) + class PrepareMasks(object): """ Prepares the gt masks for use_gt_bboxes by cropping with the gt box @@ -538,7 +549,7 @@ def __init__(self, mask_size, use_gt_bboxes): def __call__(self, image, masks, boxes, labels=None): if not self.use_gt_bboxes: return image, masks, boxes, labels - + height, width, _ = image.shape new_masks = np.zeros((masks.shape[0], self.mask_size ** 2)) @@ -552,17 +563,18 @@ def __call__(self, image, masks, boxes, labels=None): x1, y1, x2, y2 = (int(x1), int(y1), int(x2), int(y2)) # +1 So that if y1=10.6 and y2=10.9 we still have a bounding box - cropped_mask = masks[i, y1:(y2+1), x1:(x2+1)] + cropped_mask = masks[i, y1 : (y2 + 1), x1 : (x2 + 1)] scaled_mask = cv2.resize(cropped_mask, (self.mask_size, self.mask_size)) new_masks[i, :] = scaled_mask.reshape(1, -1) - + # Binarize - new_masks[new_masks > 0.5] = 1 + new_masks[new_masks > 0.5] = 1 new_masks[new_masks <= 0.5] = 0 return image, new_masks, boxes, labels + class BackboneTransform(object): """ Transforms a BRG image made of floats in the range [0, 255] to whatever @@ -571,14 +583,17 @@ class BackboneTransform(object): transform is a transform config object (see config.py). in_channel_order is probably 'BGR' but you do you, kid. """ + def __init__(self, transform, mean, std, in_channel_order): self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) self.transform = transform # Here I use "Algorithms and Coding" to convert string permutations to numbers self.channel_map = {c: idx for idx, c in enumerate(in_channel_order)} - self.channel_permutation = [self.channel_map[c] for c in transform.channel_order] + self.channel_permutation = [ + self.channel_map[c] for c in transform.channel_order + ] def __call__(self, img, masks=None, boxes=None, labels=None): @@ -587,7 +602,7 @@ def __call__(self, img, masks=None, boxes=None, labels=None): if self.transform.normalize: img = (img - self.mean) / self.std elif self.transform.subtract_means: - img = (img - self.mean) + img = img - self.mean elif self.transform.to_float: img = img / 255 @@ -596,23 +611,25 @@ def __call__(self, img, masks=None, boxes=None, labels=None): return img.astype(np.float32), masks, boxes, labels - - class BaseTransform(object): """ Transorm to be used when evaluating. """ def __init__(self, mean=MEANS, std=STD): - self.augment = Compose([ - ConvertFromInts(), - Resize(resize_gt=False), - BackboneTransform(cfg.backbone.transform, mean, std, 'BGR') - ]) + self.augment = Compose( + [ + ConvertFromInts(), + Resize(resize_gt=False), + BackboneTransform(cfg.backbone.transform, mean, std, "BGR"), + ] + ) def __call__(self, img, masks=None, boxes=None, labels=None): return self.augment(img, masks, boxes, labels) + import torch.nn.functional as F + class FastBaseTransform(torch.nn.Module): """ Transform that does all operations on the GPU for super speed. @@ -624,39 +641,40 @@ def __init__(self): super().__init__() self.mean = torch.Tensor(MEANS).float().cuda()[None, :, None, None] - self.std = torch.Tensor( STD ).float().cuda()[None, :, None, None] + self.std = torch.Tensor(STD).float().cuda()[None, :, None, None] self.transform = cfg.backbone.transform def forward(self, img): self.mean = self.mean.to(img.device) - self.std = self.std.to(img.device) - + self.std = self.std.to(img.device) + # img assumed to be a pytorch BGR image with channel order [n, h, w, c] if cfg.preserve_aspect_ratio: _, h, w, _ = img.size() img_size = Resize.calc_size_preserve_ar(w, h, cfg.max_size) - img_size = (img_size[1], img_size[0]) # Pytorch needs h, w + img_size = (img_size[1], img_size[0]) # Pytorch needs h, w else: img_size = (cfg.max_size, cfg.max_size) img = img.permute(0, 3, 1, 2).contiguous() - img = F.interpolate(img, img_size, mode='bilinear', align_corners=False) + img = F.interpolate(img, img_size, mode="bilinear", align_corners=False) if self.transform.normalize: img = (img - self.mean) / self.std elif self.transform.subtract_means: - img = (img - self.mean) + img = img - self.mean elif self.transform.to_float: img = img / 255 - - if self.transform.channel_order != 'RGB': + + if self.transform.channel_order != "RGB": raise NotImplementedError - + img = img[:, (2, 1, 0), :, :].contiguous() # Return value is in channel order [n, c, h, w] and RGB return img + def do_nothing(img=None, masks=None, boxes=None, labels=None): return img, masks, boxes, labels @@ -664,25 +682,30 @@ def do_nothing(img=None, masks=None, boxes=None, labels=None): def enable_if(condition, obj): return obj if condition else do_nothing + class SSDAugmentation(object): """ Transform to be used when training. """ def __init__(self, mean=MEANS, std=STD): - self.augment = Compose([ - ConvertFromInts(), - ToAbsoluteCoords(), - enable_if(cfg.augment_photometric_distort, PhotometricDistort()), - enable_if(cfg.augment_expand, Expand(mean)), - enable_if(cfg.augment_random_sample_crop, RandomSampleCrop()), - enable_if(cfg.augment_random_mirror, RandomMirror()), - enable_if(cfg.augment_random_flip, RandomFlip()), - enable_if(cfg.augment_random_flip, RandomRot90()), - Resize(), - enable_if(not cfg.preserve_aspect_ratio, Pad(cfg.max_size, cfg.max_size, mean)), - ToPercentCoords(), - PrepareMasks(cfg.mask_size, cfg.use_gt_bboxes), - BackboneTransform(cfg.backbone.transform, mean, std, 'BGR') - ]) + self.augment = Compose( + [ + ConvertFromInts(), + ToAbsoluteCoords(), + enable_if(cfg.augment_photometric_distort, PhotometricDistort()), + enable_if(cfg.augment_expand, Expand(mean)), + enable_if(cfg.augment_random_sample_crop, RandomSampleCrop()), + enable_if(cfg.augment_random_mirror, RandomMirror()), + enable_if(cfg.augment_random_flip, RandomFlip()), + enable_if(cfg.augment_random_flip, RandomRot90()), + Resize(), + enable_if( + not cfg.preserve_aspect_ratio, Pad(cfg.max_size, cfg.max_size, mean) + ), + ToPercentCoords(), + PrepareMasks(cfg.mask_size, cfg.use_gt_bboxes), + BackboneTransform(cfg.backbone.transform, mean, std, "BGR"), + ] + ) def __call__(self, img, masks, boxes, labels): return self.augment(img, masks, boxes, labels) diff --git a/utils/functions.py b/utils/functions.py index 3b7a4e45a..b0276d654 100644 --- a/utils/functions.py +++ b/utils/functions.py @@ -6,7 +6,8 @@ from pathlib import Path from layers.interpolate import InterpolateModule -class MovingAverage(): + +class MovingAverage: """ Keeps an average window of the specified number of items. """ def __init__(self, max_window_size=1000): @@ -16,15 +17,15 @@ def __init__(self, max_window_size=1000): def add(self, elem): """ Adds an element to the window, removing the earliest element if necessary. """ if not math.isfinite(elem): - print('Warning: Moving average ignored a value of %f' % elem) + print("Warning: Moving average ignored a value of %f" % elem) return - + self.window.append(elem) self.sum += elem if len(self.window) > self.max_window_size: self.sum -= self.window.popleft() - + def append(self, elem): """ Same as add just more pythonic. """ self.add(elem) @@ -40,22 +41,22 @@ def get_avg(self): def __str__(self): return str(self.get_avg()) - + def __repr__(self): return repr(self.get_avg()) - + def __len__(self): return len(self.window) -class ProgressBar(): +class ProgressBar: """ A simple progress bar that just outputs a string. """ def __init__(self, length, max_val): self.max_val = max_val self.length = length self.cur_val = 0 - + self.cur_num_bars = -1 self._update_str() @@ -68,7 +69,7 @@ def set_val(self, new_val): self.cur_val = 0 self._update_str() - + def is_finished(self): return self.cur_val == self.max_val @@ -77,11 +78,11 @@ def _update_str(self): if num_bars != self.cur_num_bars: self.cur_num_bars = num_bars - self.string = '█' * num_bars + '░' * (self.length - num_bars) - + self.string = "█" * num_bars + "░" * (self.length - num_bars) + def __repr__(self): return self.string - + def __str__(self): return self.string @@ -90,8 +91,9 @@ def init_console(): """ Initialize the console to be able to use ANSI escape characters on Windows. """ - if os.name == 'nt': + if os.name == "nt": from colorama import init + init() @@ -102,72 +104,76 @@ class SavePath: What am I doing with my life? """ - def __init__(self, model_name:str, epoch:int, iteration:int): + def __init__(self, model_name: str, epoch: int, iteration: int): self.model_name = model_name self.epoch = epoch self.iteration = iteration - def get_path(self, root:str=''): - file_name = self.model_name + '_' + str(self.epoch) + '_' + str(self.iteration) + '.pth' + def get_path(self, root: str = ""): + file_name = ( + self.model_name + "_" + str(self.epoch) + "_" + str(self.iteration) + ".pth" + ) return os.path.join(root, file_name) @staticmethod - def from_str(path:str): + def from_str(path: str): file_name = os.path.basename(path) - - if file_name.endswith('.pth'): + + if file_name.endswith(".pth"): file_name = file_name[:-4] - - params = file_name.split('_') - if file_name.endswith('interrupt'): + params = file_name.split("_") + + if file_name.endswith("interrupt"): params = params[:-1] - - model_name = '_'.join(params[:-2]) + + model_name = "_".join(params[:-2]) epoch = params[-2] iteration = params[-1] - + return SavePath(model_name, int(epoch), int(iteration)) @staticmethod def remove_interrupt(save_folder): - for p in Path(save_folder).glob('*_interrupt.pth'): + for p in Path(save_folder).glob("*_interrupt.pth"): p.unlink() - + @staticmethod def get_interrupt(save_folder): - for p in Path(save_folder).glob('*_interrupt.pth'): + for p in Path(save_folder).glob("*_interrupt.pth"): return str(p) return None - + @staticmethod def get_latest(save_folder, config): """ Note: config should be config.name. """ max_iter = -1 max_name = None - for p in Path(save_folder).glob(config + '_*'): + for p in Path(save_folder).glob(config + "_*"): path_name = str(p) try: save = SavePath.from_str(path_name) except: - continue - + continue + if save.model_name == config and save.iteration > max_iter: max_iter = save.iteration max_name = path_name return max_name + def make_net(in_channels, conf, include_last_relu=True): """ A helper function to take a config setting and turn it into a network. Used by protonet and extrahead. Returns (network, out_channels) """ + def make_layer(layer_cfg): nonlocal in_channels - + # Possible patterns: # ( 256, 3, {}) -> conv # ( 256,-2, {}) -> deconv @@ -179,7 +185,7 @@ def make_layer(layer_cfg): if isinstance(layer_cfg[0], str): layer_name = layer_cfg[0] - if layer_name == 'cat': + if layer_name == "cat": nets = [make_net(in_channels, x) for x in layer_cfg[1]] layer = Concat([net[0] for net in nets], layer_cfg[2]) num_channels = sum([net[1] for net in nets]) @@ -188,13 +194,22 @@ def make_layer(layer_cfg): kernel_size = layer_cfg[1] if kernel_size > 0: - layer = nn.Conv2d(in_channels, num_channels, kernel_size, **layer_cfg[2]) + layer = nn.Conv2d( + in_channels, num_channels, kernel_size, **layer_cfg[2] + ) else: if num_channels is None: - layer = InterpolateModule(scale_factor=-kernel_size, mode='bilinear', align_corners=False, **layer_cfg[2]) + layer = InterpolateModule( + scale_factor=-kernel_size, + mode="bilinear", + align_corners=False, + **layer_cfg[2] + ) else: - layer = nn.ConvTranspose2d(in_channels, num_channels, -kernel_size, **layer_cfg[2]) - + layer = nn.ConvTranspose2d( + in_channels, num_channels, -kernel_size, **layer_cfg[2] + ) + in_channels = num_channels if num_channels is not None else in_channels # Don't return a ReLU layer if we're doing an upsample. This probably doesn't affect anything @@ -210,4 +225,4 @@ def make_layer(layer_cfg): if not include_last_relu: net = net[:-1] - return nn.Sequential(*(net)), in_channels \ No newline at end of file + return nn.Sequential(*(net)), in_channels diff --git a/utils/logger.py b/utils/logger.py index 6d87f576b..1ac28db63 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -11,13 +11,14 @@ import numpy as np # Because Python's package heierarchy system sucks -if __name__ == '__main__': +if __name__ == "__main__": from nvinfo import gpu_info, visible_gpus, nvsmi_available from functions import MovingAverage else: from .nvinfo import gpu_info, visible_gpus, nvsmi_available from .functions import MovingAverage + class Log: """ A class to log information during training per information and save it out. @@ -31,71 +32,77 @@ class Log: - log_time: Also log the time in each iteration. """ - def __init__(self, log_name:str, log_dir:str='logs/', session_data:dict={}, - overwrite:bool=False, log_gpu_stats:bool=True, log_time:bool=True): - + def __init__( + self, + log_name: str, + log_dir: str = "logs/", + session_data: dict = {}, + overwrite: bool = False, + log_gpu_stats: bool = True, + log_time: bool = True, + ): + if log_gpu_stats and not nvsmi_available(): - print('Warning: Log created with log_gpu_stats=True, but nvidia-smi ' \ - 'was not found. Setting log_gpu_stats to False.') + print( + "Warning: Log created with log_gpu_stats=True, but nvidia-smi " + "was not found. Setting log_gpu_stats to False." + ) log_gpu_stats = False - + if not os.path.exists(log_dir): os.makedirs(log_dir) - self.log_path = os.path.join(log_dir, log_name + '.log') + self.log_path = os.path.join(log_dir, log_name + ".log") # if os.path.exists(self.log_path) and overwrite: # os.unlink(self.log_path) if os.path.exists(self.log_path): # Log already exists, so we're going to add to it. Increment the session counter. - with open(self.log_path, 'r') as f: - for last in f: pass + with open(self.log_path, "r") as f: + for last in f: + pass if len(last) > 1: - self.session = json.loads(last)['session'] + 1 + self.session = json.loads(last)["session"] + 1 else: self.session = 0 else: self.session = 0 - self.log_gpu_stats = log_gpu_stats self.log_time = log_time if self.log_gpu_stats: self.visible_gpus = visible_gpus() - self._log_session_header(session_data) - - def _log_session_header(self, session_data:dict): + def _log_session_header(self, session_data: dict): """ Log information that does not change between iterations here. This is to cut down on the file size so you're not outputing this every iteration. """ info = {} - info['type'] = 'session' - info['session'] = self.session + info["type"] = "session" + info["session"] = self.session - info['data'] = session_data + info["data"] = session_data if self.log_gpu_stats: - keys = ['idx', 'name', 'uuid', 'pwr_cap', 'mem_total'] + keys = ["idx", "name", "uuid", "pwr_cap", "mem_total"] gpus = gpu_info() - info['gpus'] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus] - + info["gpus"] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus] + if self.log_time: - info['time'] = time.time() + info["time"] = time.time() - out = json.dumps(info) + '\n' + out = json.dumps(info) + "\n" - with open(self.log_path, 'a') as f: + with open(self.log_path, "a") as f: f.write(out) - - def log(self, type:str, data:dict={}, **kwdargs): + def log(self, type: str, data: dict = {}, **kwdargs): """ Add an iteration to the log with the specified data points. Type should be the type of information this is (e.g., train, valid, etc.) @@ -104,89 +111,88 @@ def log(self, type:str, data:dict={}, **kwdargs): Values should be json-serializable. """ info = {} - - info['type'] = type - info['session'] = self.session + + info["type"] = type + info["session"] = self.session kwdargs.update(data) - info['data'] = kwdargs + info["data"] = kwdargs if self.log_gpu_stats: - keys = ['fan_spd', 'temp', 'pwr_used', 'mem_used', 'util'] - + keys = ["fan_spd", "temp", "pwr_used", "mem_used", "util"] + gpus = gpu_info() - info['gpus'] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus] - + info["gpus"] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus] + if self.log_time: - info['time'] = time.time() - - - out = json.dumps(info) + '\n' + info["time"] = time.time() - with open(self.log_path, 'a') as f: + out = json.dumps(info) + "\n" + + with open(self.log_path, "a") as f: f.write(out) -class LogEntry(): +class LogEntry: """ A class that allows you to navigate a dictonary using x.a.b[2].c, etc. """ - def __init__(self, entry:Union[dict, list]): + def __init__(self, entry: Union[dict, list]): self._ = entry def __getattr__(self, name): - if name == '_': - return self.__dict__['_'] + if name == "_": + return self.__dict__["_"] - res = self.__dict__['_'][name] + res = self.__dict__["_"][name] if type(res) == dict or type(res) == list: return LogEntry(res) else: return res - + def __getitem__(self, name): return self.__getattr__(name) def __len__(self): - return len(self.__dict__['_']) + return len(self.__dict__["_"]) -class LogVisualizer(): + +class LogVisualizer: COLORS = [ - 'xkcd:azure', - 'xkcd:coral', - 'xkcd:turquoise', - 'xkcd:orchid', - 'xkcd:orange', - - 'xkcd:blue', - 'xkcd:red', - 'xkcd:teal', - 'xkcd:magenta', - 'xkcd:orangered' + "xkcd:azure", + "xkcd:coral", + "xkcd:turquoise", + "xkcd:orchid", + "xkcd:orange", + "xkcd:blue", + "xkcd:red", + "xkcd:teal", + "xkcd:magenta", + "xkcd:orangered", ] def __init__(self): self.logs = [] self.total_logs = [] self.log_names = [] - - def _decode(self, query:str) -> list: - path, select = (query.split(';') + [''])[:2] - - if select.strip() == '': + + def _decode(self, query: str) -> list: + path, select = (query.split(";") + [""])[:2] + + if select.strip() == "": select = lambda x, s: True else: - select = eval('lambda x, s: ' + select) + select = eval("lambda x, s: " + select) - if path.strip() == '': + if path.strip() == "": path = lambda x, s: x else: - path = eval('lambda x, s: ' + path) - + path = eval("lambda x, s: " + path) + return path, select - def _follow(self, entry:LogEntry, query:list): + def _follow(self, entry: LogEntry, query: list): path, select = query try: @@ -194,7 +200,7 @@ def _follow(self, entry:LogEntry, query:list): res = path(entry, entry._s) if type(res) == LogEntry: - return res.__dict__['_'] + return res.__dict__["_"] else: return res else: @@ -202,14 +208,14 @@ def _follow(self, entry:LogEntry, query:list): except (KeyError, IndexError): return None - def _color(self, idx:int): + def _color(self, idx: int): return self.COLORS[idx % len(self.COLORS)] - def sessions(self, path:str): + def sessions(self, path: str): """ Prints statistics about the sessions in the file. """ if not os.path.exists(path): - print(path + ' doesn\'t exist!') + print(path + " doesn't exist!") return cur_session = None @@ -219,75 +225,84 @@ def sessions(self, path:str): def pop_session(): delta = last_time - cur_time - time_str = str(datetime.timedelta(seconds=delta)).split('.')[0] - print('Session % 3d: % 8d entries | %s elapsed' % (cur_session, num_entries, time_str)) + time_str = str(datetime.timedelta(seconds=delta)).split(".")[0] + print( + "Session % 3d: % 8d entries | %s elapsed" + % (cur_session, num_entries, time_str) + ) - with open(path, 'r') as f: + with open(path, "r") as f: for line in f: line = line.strip() if len(line) > 0: js = json.loads(line) - if js['type'] == 'session': + if js["type"] == "session": if cur_session is not None: pop_session() - cur_time = js['time'] - cur_session = js['session'] + cur_time = js["time"] + cur_session = js["session"] num_entries = 0 - last_time = js['time'] + last_time = js["time"] num_entries += 1 - + pop_session() - def add(self, path:str, session:Union[int,list]=None): + def add(self, path: str, session: Union[int, list] = None): """ Add a log file to the list of logs being considered. """ log = defaultdict(lambda: []) total_log = [] if not os.path.exists(path): - print(path + ' doesn\'t exist!') + print(path + " doesn't exist!") return session_idx = 0 ignoring = True - + def valid(idx): if session is None: return True elif type(session) == int: - return (idx == session) + return idx == session else: return idx in session - with open(path, 'r') as f: + with open(path, "r") as f: for line in f: line = line.strip() if len(line) > 0: js = json.loads(line) - - _type = js['type'] - if _type == 'session': - session_idx = js['session'] + + _type = js["type"] + if _type == "session": + session_idx = js["session"] ignoring = not valid(session_idx) if not ignoring: ljs = LogEntry(js) - if _type == 'session': - js['_s'] = ljs + if _type == "session": + js["_s"] = ljs else: - js['_s'] =log['session'][-1] + js["_s"] = log["session"][-1] log[_type].append(ljs) total_log.append(ljs) - + name = os.path.basename(path) if session is not None: - name += ' (Session %s)' % session + name += " (Session %s)" % session self.logs.append(log) self.total_logs.append(total_log) self.log_names.append(name) - def query(self, x:Union[str, list], entry_type:str=None, x_idx:int=None, log_idx:int=None) -> list: + def query( + self, + x: Union[str, list], + entry_type: str = None, + x_idx: int = None, + log_idx: int = None, + ) -> list: """ Given a query string (can be already decoded for faster computation), query the entire log and return all values found by that query. If both log_idx and x_idx is None, this will be @@ -298,27 +313,31 @@ def query(self, x:Union[str, list], entry_type:str=None, x_idx:int=None, log_idx if type(x) is not list: x = self._decode(x) - + res = [] - for idx in (range(len(self.logs)) if log_idx is None else [log_idx]): + for idx in range(len(self.logs)) if log_idx is None else [log_idx]: candidates = [] - log = self.total_logs[idx] if entry_type is None else self.logs[idx][entry_type] + log = ( + self.total_logs[idx] + if entry_type is None + else self.logs[idx][entry_type] + ) for entry in log: candidate = self._follow(entry, x) if candidate is not None: candidates.append(candidate) - + if x_idx is not None: candidates = candidates[x_idx] res.append(candidates) - + if log_idx is not None: res = res[0] return res - def check(self, entry_type:str, x:str): + def check(self, entry_type: str, x: str): """ Checks the log for the valid keys for this input. """ keys = set() x = self._decode(x) @@ -331,11 +350,11 @@ def check(self, entry_type:str, x:str): for key in res.keys(): keys.add(key) elif type(res) == list: - keys.add('< %d' % len(res)) - + keys.add("< %d" % len(res)) + return list(keys) - def plot(self, entry_type:str, x:str, y:str, smoothness:int=0): + def plot(self, entry_type: str, x: str, y: str, smoothness: int = 0): """ Plot sequential log data. """ query_x = self._decode(x) @@ -361,18 +380,25 @@ def plot(self, entry_type:str, x:str, y:str, smoothness:int=0): if len(avg) < smoothness // 10: continue - + _x.append(val_x) _y.append(val_y) - + plt.plot(_x, _y, color=self._color(idx), label=name) - - plt.title(y.replace('x.', entry_type + '.')) + + plt.title(y.replace("x.", entry_type + ".")) plt.legend() - plt.grid(linestyle=':', linewidth=0.5) + plt.grid(linestyle=":", linewidth=0.5) plt.show() - def bar(self, entry_type:str, x:str, labels:list=None, diff:bool=False, x_idx:int=-1): + def bar( + self, + entry_type: str, + x: str, + labels: list = None, + diff: bool = False, + x_idx: int = -1, + ): """ Plot a bar chart. The result of x should be list or dictionary. """ query = self._decode(x) @@ -391,12 +417,12 @@ def bar(self, entry_type:str, x:str, labels:list=None, diff:bool=False, x_idx:in candidates.append(test) elif type(test) == list: candidates.append({idx: v for idx, v in enumerate(test)}) - + if len(candidates) > 0: data_points.append((name, candidates[x_idx])) - + if len(data_points) == 0: - print('Warning: Nothing to show in bar chart!') + print("Warning: Nothing to show in bar chart!") return names = [x[0] for x in data_points] @@ -410,12 +436,14 @@ def bar(self, entry_type:str, x:str, labels:list=None, diff:bool=False, x_idx:in for datum in data_points: for k in datum: data_labels.add(k) - + data_labels = list(data_labels) data_labels.sort() - - data_values = [[(datum[k] if k in datum else None) for k in data_labels] for datum in data_points] + data_values = [ + [(datum[k] if k in datum else None) for k in data_labels] + for datum in data_points + ] if diff: for idx in reversed(range(len(data_values))): @@ -425,33 +453,41 @@ def bar(self, entry_type:str, x:str, labels:list=None, diff:bool=False, x_idx:in else: data_values[idx][jdx] -= data_values[0][jdx] - series_labels = names # Plot the graph now num_bars = len(series_labels) bar_width = 1 / (num_bars + 1) - + # Set position of bar on X axis positions = [np.arange(len(data_labels))] for _ in range(1, num_bars): positions.append([x + bar_width for x in positions[-1]]) - + # Make the plot - for idx, (series, data, pos) in enumerate(zip(series_labels, data_values, positions)): - plt.bar(pos, data, color=self._color(idx), width=bar_width, edgecolor='white', label=series) - + for idx, (series, data, pos) in enumerate( + zip(series_labels, data_values, positions) + ): + plt.bar( + pos, + data, + color=self._color(idx), + width=bar_width, + edgecolor="white", + label=series, + ) + # Add xticks on the middle of the group bars - plt.title(x.replace('x.', entry_type + '.') + (' diff' if diff else '')) + plt.title(x.replace("x.", entry_type + ".") + (" diff" if diff else "")) plt.xticks([r + bar_width for r in range(len(data_labels))], data_labels) - + # Create legend & Show graphic plt.legend() plt.show() - - - def elapsed_time(self, cond1:str='', cond2:str='', legible:bool=True) -> list: + def elapsed_time( + self, cond1: str = "", cond2: str = "", legible: bool = True + ) -> list: """ Returns the elapsed time between two entries based on the given conditionals. If a query isn't specified, the first / last entry will be used. The first query @@ -459,31 +495,26 @@ def elapsed_time(self, cond1:str='', cond2:str='', legible:bool=True) -> list: Setting legible to true returns human-readable results, while false returns seconds. """ - q1 = 'x.time; ' + cond1 - q2 = 'x.time; ' + cond2 + q1 = "x.time; " + cond1 + q2 = "x.time; " + cond2 x1 = self.query(q1, x_idx=0) x2 = self.query(q2, x_idx=-1) - - diff = (lambda x: str(datetime.timedelta(seconds=x)).split('.')[0]) if legible else lambda x: x - - return [diff(b - a) for a, b in zip(x1, x2)] - - - - - - - + diff = ( + (lambda x: str(datetime.timedelta(seconds=x)).split(".")[0]) + if legible + else lambda x: x + ) + return [diff(b - a) for a, b in zip(x1, x2)] -if __name__ == '__main__': - if len(sys.argv) < 4+1: - print('Usage: python utils/logger.py ') +if __name__ == "__main__": + if len(sys.argv) < 4 + 1: + print("Usage: python utils/logger.py ") exit() - + vis = LogVisualizer() vis.add(sys.argv[1]) vis.plot(sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/utils/nvinfo.py b/utils/nvinfo.py index 98dd5c980..3ca197b43 100644 --- a/utils/nvinfo.py +++ b/utils/nvinfo.py @@ -4,60 +4,69 @@ import shutil import os + def gpu_info() -> list: """ Returns a dictionary of stats mined from nvidia-smi for each gpu in a list. Adapted from nvgpu: https://pypi.org/project/nvgpu/, but mine has more info. """ - gpus = [line for line in _run_cmd(['nvidia-smi', '-L']) if line] - gpu_infos = [re.match('GPU ([0-9]+): ([^(]+) \(UUID: ([^)]+)\)', gpu).groups() for gpu in gpus] - gpu_infos = [dict(zip(['idx', 'name', 'uuid'], info)) for info in gpu_infos] + gpus = [line for line in _run_cmd(["nvidia-smi", "-L"]) if line] + gpu_infos = [ + re.match("GPU ([0-9]+): ([^(]+) \(UUID: ([^)]+)\)", gpu).groups() + for gpu in gpus + ] + gpu_infos = [dict(zip(["idx", "name", "uuid"], info)) for info in gpu_infos] gpu_count = len(gpus) - lines = _run_cmd(['nvidia-smi']) - selected_lines = lines[7:7 + 3 * gpu_count] + lines = _run_cmd(["nvidia-smi"]) + selected_lines = lines[7 : 7 + 3 * gpu_count] for i in range(gpu_count): - mem_used, mem_total = [int(m.strip().replace('MiB', '')) for m in - selected_lines[3 * i + 1].split('|')[2].strip().split('/')] - - pw_tmp_info, mem_info, util_info = [x.strip() for x in selected_lines[3 * i + 1].split('|')[1:-1]] - - pw_tmp_info = [x[:-1] for x in pw_tmp_info.split(' ') if len(x) > 0] - fan_speed, temperature, pwr_used, pwr_cap = [int(pw_tmp_info[i]) for i in (0, 1, 3, 5)] - gpu_infos[i]['fan_spd' ] = fan_speed - gpu_infos[i]['temp' ] = temperature - gpu_infos[i]['pwr_used'] = pwr_used - gpu_infos[i]['pwr_cap' ] = pwr_cap - - mem_used, mem_total = [int(x) for x in mem_info.replace('MiB', '').split(' / ')] - gpu_infos[i]['mem_used' ] = mem_used - gpu_infos[i]['mem_total'] = mem_total - - utilization = int(util_info.split(' ')[0][:-1]) - gpu_infos[i]['util'] = utilization - - gpu_infos[i]['idx'] = int(gpu_infos[i]['idx']) + mem_used, mem_total = [ + int(m.strip().replace("MiB", "")) + for m in selected_lines[3 * i + 1].split("|")[2].strip().split("/") + ] + + pw_tmp_info, mem_info, util_info = [ + x.strip() for x in selected_lines[3 * i + 1].split("|")[1:-1] + ] + + pw_tmp_info = [x[:-1] for x in pw_tmp_info.split(" ") if len(x) > 0] + fan_speed, temperature, pwr_used, pwr_cap = [ + int(pw_tmp_info[i]) for i in (0, 1, 3, 5) + ] + gpu_infos[i]["fan_spd"] = fan_speed + gpu_infos[i]["temp"] = temperature + gpu_infos[i]["pwr_used"] = pwr_used + gpu_infos[i]["pwr_cap"] = pwr_cap + + mem_used, mem_total = [int(x) for x in mem_info.replace("MiB", "").split(" / ")] + gpu_infos[i]["mem_used"] = mem_used + gpu_infos[i]["mem_total"] = mem_total + + utilization = int(util_info.split(" ")[0][:-1]) + gpu_infos[i]["util"] = utilization + + gpu_infos[i]["idx"] = int(gpu_infos[i]["idx"]) return gpu_infos + def nvsmi_available() -> bool: """ Returns whether or not nvidia-smi is present in this system's PATH. """ - return shutil.which('nvidia-smi') is not None + return shutil.which("nvidia-smi") is not None def visible_gpus() -> list: """ Returns a list of the indexes of all the gpus visible to pytorch. """ - if 'CUDA_VISIBLE_DEVICES' not in os.environ: + if "CUDA_VISIBLE_DEVICES" not in os.environ: return list(range(len(gpu_info()))) else: - return [int(x.strip()) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] - - + return [int(x.strip()) for x in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] -def _run_cmd(cmd:list) -> list: +def _run_cmd(cmd: list) -> list: """ Runs a command and returns a list of output lines. """ output = subprocess.check_output(cmd) - output = output.decode('UTF-8') - return output.split('\n') \ No newline at end of file + output = output.decode("UTF-8") + return output.split("\n") diff --git a/utils/timer.py b/utils/timer.py index 7c1c403dd..f1e3bfb65 100644 --- a/utils/timer.py +++ b/utils/timer.py @@ -1,131 +1,145 @@ import time from collections import defaultdict -_total_times = defaultdict(lambda: 0) +_total_times = defaultdict(lambda: 0) _start_times = defaultdict(lambda: -1) _disabled_names = set() _timer_stack = [] _running_timer = None _disable_all = False + def disable_all(): - global _disable_all - _disable_all = True + global _disable_all + _disable_all = True + def enable_all(): - global _disable_all - _disable_all = False + global _disable_all + _disable_all = False + def disable(fn_name): - """ Disables the given function name fom being considered for the average or outputted in print_stats. """ - _disabled_names.add(fn_name) + """ Disables the given function name fom being considered for the average or outputted in print_stats. """ + _disabled_names.add(fn_name) + def enable(fn_name): - """ Enables function names disabled by disable. """ - _disabled_names.remove(fn_name) + """ Enables function names disabled by disable. """ + _disabled_names.remove(fn_name) + def reset(): - """ Resets the current timer. Call this at the start of an iteration. """ - global _running_timer - _total_times.clear() - _start_times.clear() - _timer_stack.clear() - _running_timer = None + """ Resets the current timer. Call this at the start of an iteration. """ + global _running_timer + _total_times.clear() + _start_times.clear() + _timer_stack.clear() + _running_timer = None + def start(fn_name, use_stack=True): - """ + """ Start timing the specific function. Note: If use_stack is True, only one timer can be active at a time. Once you stop this timer, the previous one will start again. """ - global _running_timer, _disable_all - - if _disable_all: - return - - if use_stack: - if _running_timer is not None: - stop(_running_timer, use_stack=False) - _timer_stack.append(_running_timer) - start(fn_name, use_stack=False) - _running_timer = fn_name - else: - _start_times[fn_name] = time.perf_counter() + global _running_timer, _disable_all + + if _disable_all: + return + + if use_stack: + if _running_timer is not None: + stop(_running_timer, use_stack=False) + _timer_stack.append(_running_timer) + start(fn_name, use_stack=False) + _running_timer = fn_name + else: + _start_times[fn_name] = time.perf_counter() + def stop(fn_name=None, use_stack=True): - """ + """ If use_stack is True, this will stop the currently running timer and restore the previous timer on the stack if that exists. Note if use_stack is True, fn_name will be ignored. If use_stack is False, this will just stop timing the timer fn_name. """ - global _running_timer, _disable_all - - if _disable_all: - return - - if use_stack: - if _running_timer is not None: - stop(_running_timer, use_stack=False) - if len(_timer_stack) > 0: - _running_timer = _timer_stack.pop() - start(_running_timer, use_stack=False) - else: - _running_timer = None - else: - print('Warning: timer stopped with no timer running!') - else: - if _start_times[fn_name] > -1: - _total_times[fn_name] += time.perf_counter() - _start_times[fn_name] - else: - print('Warning: timer for %s stopped before starting!' % fn_name) + global _running_timer, _disable_all + + if _disable_all: + return + + if use_stack: + if _running_timer is not None: + stop(_running_timer, use_stack=False) + if len(_timer_stack) > 0: + _running_timer = _timer_stack.pop() + start(_running_timer, use_stack=False) + else: + _running_timer = None + else: + print("Warning: timer stopped with no timer running!") + else: + if _start_times[fn_name] > -1: + _total_times[fn_name] += time.perf_counter() - _start_times[fn_name] + else: + print("Warning: timer for %s stopped before starting!" % fn_name) def print_stats(): - """ Prints the current timing information into a table. """ - print() + """ Prints the current timing information into a table. """ + print() - all_fn_names = [k for k in _total_times.keys() if k not in _disabled_names] + all_fn_names = [k for k in _total_times.keys() if k not in _disabled_names] - max_name_width = max([len(k) for k in all_fn_names] + [4]) - if max_name_width % 2 == 1: max_name_width += 1 - format_str = ' {:>%d} | {:>10.4f} ' % max_name_width + max_name_width = max([len(k) for k in all_fn_names] + [4]) + if max_name_width % 2 == 1: + max_name_width += 1 + format_str = " {:>%d} | {:>10.4f} " % max_name_width - header = (' {:^%d} | {:^10} ' % max_name_width).format('Name', 'Time (ms)') - print(header) + header = (" {:^%d} | {:^10} " % max_name_width).format("Name", "Time (ms)") + print(header) - sep_idx = header.find('|') - sep_text = ('-' * sep_idx) + '+' + '-' * (len(header)-sep_idx-1) - print(sep_text) + sep_idx = header.find("|") + sep_text = ("-" * sep_idx) + "+" + "-" * (len(header) - sep_idx - 1) + print(sep_text) - for name in all_fn_names: - print(format_str.format(name, _total_times[name]*1000)) - - print(sep_text) - print(format_str.format('Total', total_time()*1000)) - print() + for name in all_fn_names: + print(format_str.format(name, _total_times[name] * 1000)) -def total_time(): - """ Returns the total amount accumulated across all functions in seconds. """ - return sum([elapsed_time for name, elapsed_time in _total_times.items() if name not in _disabled_names]) + print(sep_text) + print(format_str.format("Total", total_time() * 1000)) + print() -class env(): - """ +def total_time(): + """ Returns the total amount accumulated across all functions in seconds. """ + return sum( + [ + elapsed_time + for name, elapsed_time in _total_times.items() + if name not in _disabled_names + ] + ) + + +class env: + """ A class that lets you go: with timer.env(fn_name): # (...) That automatically manages a timer start and stop for you. """ - def __init__(self, fn_name, use_stack=True): - self.fn_name = fn_name - self.use_stack = use_stack - - def __enter__(self): - start(self.fn_name, use_stack=self.use_stack) + def __init__(self, fn_name, use_stack=True): + self.fn_name = fn_name + self.use_stack = use_stack - def __exit__(self, e, ev, t): - stop(self.fn_name, use_stack=self.use_stack) + def __enter__(self): + start(self.fn_name, use_stack=self.use_stack) + def __exit__(self, e, ev, t): + stop(self.fn_name, use_stack=self.use_stack) diff --git a/web/server.py b/web/server.py index e724da4cc..5c71d1ed9 100644 --- a/web/server.py +++ b/web/server.py @@ -3,56 +3,58 @@ import os PORT = 6337 -IMAGE_PATH = '../data/coco/images/' -IMAGE_FMT = '%012d.jpg' +IMAGE_PATH = "../data/coco/images/" +IMAGE_FMT = "%012d.jpg" + class Handler(SimpleHTTPRequestHandler): - - def do_GET(self): - if self.path == '/detindex': - self.send_str('\n'.join([p.name[:-5] for p in Path('dets/').glob('*.json')])) - elif self.path.startswith('/image'): - # Unsafe practices ahead! - path = self.translate_path(self.path).split('image') - self.send_file(os.path.join(path[0], IMAGE_PATH, IMAGE_FMT % int(path[1]))) - else: - super().do_GET() - - def send_str(self, string): - self.send_response(HTTPStatus.OK) - self.send_header('Content-type', 'text/plain') - self.send_header('Content-Length', str(len(string))) - self.send_header('Last-Modified', self.date_time_string()) - self.end_headers() - - self.wfile.write(string.encode()) - - def send_file(self, path): - try: - f = open(path, 'rb') - except OSError: - self.send_error(HTTPStatus.NOT_FOUND, "File not found") - return - - try: - self.send_response(HTTPStatus.OK) - self.send_header("Content-type", self.guess_type(path)) - fs = os.fstat(f.fileno()) - self.send_header("Content-Length", str(fs[6])) - self.send_header("Last-Modified", self.date_time_string(fs.st_mtime)) - self.end_headers() - - self.copyfile(f, self.wfile) - finally: - f.close() - - def send_response(self, code, message=None): - super().send_response(code, message) - - -with HTTPServer(('', PORT), Handler) as httpd: - print('Serving at port', PORT) - try: - httpd.serve_forever() - except KeyboardInterrupt: - pass + def do_GET(self): + if self.path == "/detindex": + self.send_str( + "\n".join([p.name[:-5] for p in Path("dets/").glob("*.json")]) + ) + elif self.path.startswith("/image"): + # Unsafe practices ahead! + path = self.translate_path(self.path).split("image") + self.send_file(os.path.join(path[0], IMAGE_PATH, IMAGE_FMT % int(path[1]))) + else: + super().do_GET() + + def send_str(self, string): + self.send_response(HTTPStatus.OK) + self.send_header("Content-type", "text/plain") + self.send_header("Content-Length", str(len(string))) + self.send_header("Last-Modified", self.date_time_string()) + self.end_headers() + + self.wfile.write(string.encode()) + + def send_file(self, path): + try: + f = open(path, "rb") + except OSError: + self.send_error(HTTPStatus.NOT_FOUND, "File not found") + return + + try: + self.send_response(HTTPStatus.OK) + self.send_header("Content-type", self.guess_type(path)) + fs = os.fstat(f.fileno()) + self.send_header("Content-Length", str(fs[6])) + self.send_header("Last-Modified", self.date_time_string(fs.st_mtime)) + self.end_headers() + + self.copyfile(f, self.wfile) + finally: + f.close() + + def send_response(self, code, message=None): + super().send_response(code, message) + + +with HTTPServer(("", PORT), Handler) as httpd: + print("Serving at port", PORT) + try: + httpd.serve_forever() + except KeyboardInterrupt: + pass diff --git a/yolact.py b/yolact.py index d83703bb7..fa6d73708 100644 --- a/yolact.py +++ b/yolact.py @@ -24,26 +24,27 @@ # As of March 10, 2019, Pytorch DataParallel still doesn't support JIT Script Modules use_jit = torch.cuda.device_count() <= 1 if not use_jit: - print('Multiple GPUs detected! Turning off JIT.') + print("Multiple GPUs detected! Turning off JIT.") ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn - class Concat(nn.Module): def __init__(self, nets, extra_params): super().__init__() self.nets = nn.ModuleList(nets) self.extra_params = extra_params - + def forward(self, x): # Concat each along the channel dimension return torch.cat([net(x) for net in self.nets], dim=1, **self.extra_params) + prior_cache = defaultdict(lambda: None) + class PredictionModule(nn.Module): """ The (c) prediction module adapted from DSSD: @@ -69,23 +70,34 @@ class PredictionModule(nn.Module): - parent: If parent is a PredictionModule, this module will use all the layers from parent instead of from this module. """ - - def __init__(self, in_channels, out_channels=1024, aspect_ratios=[[1]], scales=[1], parent=None, index=0): + + def __init__( + self, + in_channels, + out_channels=1024, + aspect_ratios=[[1]], + scales=[1], + parent=None, + index=0, + ): super().__init__() self.num_classes = cfg.num_classes - self.mask_dim = cfg.mask_dim # Defined by Yolact - self.num_priors = sum(len(x)*len(scales) for x in aspect_ratios) - self.parent = [parent] # Don't include this in the state dict - self.index = index - self.num_heads = cfg.num_heads # Defined by Yolact - - if cfg.mask_proto_split_prototypes_by_head and cfg.mask_type == mask_type.lincomb: + self.mask_dim = cfg.mask_dim # Defined by Yolact + self.num_priors = sum(len(x) * len(scales) for x in aspect_ratios) + self.parent = [parent] # Don't include this in the state dict + self.index = index + self.num_heads = cfg.num_heads # Defined by Yolact + + if ( + cfg.mask_proto_split_prototypes_by_head + and cfg.mask_type == mask_type.lincomb + ): self.mask_dim = self.mask_dim // self.num_heads if cfg.mask_proto_prototypes_as_features: in_channels += self.mask_dim - + if parent is None: if cfg.extra_head_net is None: out_channels = in_channels @@ -94,34 +106,70 @@ def __init__(self, in_channels, out_channels=1024, aspect_ratios=[[1]], scales=[ if cfg.use_prediction_module: self.block = Bottleneck(out_channels, out_channels // 4) - self.conv = nn.Conv2d(out_channels, out_channels, kernel_size=1, bias=True) + self.conv = nn.Conv2d( + out_channels, out_channels, kernel_size=1, bias=True + ) self.bn = nn.BatchNorm2d(out_channels) - self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, **cfg.head_layer_params) - self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, **cfg.head_layer_params) - self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params) - + self.bbox_layer = nn.Conv2d( + out_channels, self.num_priors * 4, **cfg.head_layer_params + ) + self.conf_layer = nn.Conv2d( + out_channels, + self.num_priors * self.num_classes, + **cfg.head_layer_params + ) + self.mask_layer = nn.Conv2d( + out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params + ) + if cfg.use_mask_scoring: - self.score_layer = nn.Conv2d(out_channels, self.num_priors, **cfg.head_layer_params) + self.score_layer = nn.Conv2d( + out_channels, self.num_priors, **cfg.head_layer_params + ) if cfg.use_instance_coeff: - self.inst_layer = nn.Conv2d(out_channels, self.num_priors * cfg.num_instance_coeffs, **cfg.head_layer_params) - + self.inst_layer = nn.Conv2d( + out_channels, + self.num_priors * cfg.num_instance_coeffs, + **cfg.head_layer_params + ) + # What is this ugly lambda doing in the middle of all this clean prediction module code? def make_extra(num_layers): if num_layers == 0: return lambda x: x else: # Looks more complicated than it is. This just creates an array of num_layers alternating conv-relu - return nn.Sequential(*sum([[ - nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), - nn.ReLU(inplace=True) - ] for _ in range(num_layers)], [])) + return nn.Sequential( + *sum( + [ + [ + nn.Conv2d( + out_channels, + out_channels, + kernel_size=3, + padding=1, + ), + nn.ReLU(inplace=True), + ] + for _ in range(num_layers) + ], + [], + ) + ) + + self.bbox_extra, self.conf_extra, self.mask_extra = [ + make_extra(x) for x in cfg.extra_layers + ] - self.bbox_extra, self.conf_extra, self.mask_extra = [make_extra(x) for x in cfg.extra_layers] - if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_coeff_gate: - self.gate_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, kernel_size=3, padding=1) + self.gate_layer = nn.Conv2d( + out_channels, + self.num_priors * self.mask_dim, + kernel_size=3, + padding=1, + ) self.aspect_ratios = aspect_ratios self.scales = scales @@ -144,21 +192,21 @@ def forward(self, x): """ # In case we want to use another module's layers src = self if self.parent[0] is None else self.parent[0] - + conv_h = x.size(2) conv_w = x.size(3) - + if cfg.extra_head_net is not None: x = src.upfeature(x) - + if cfg.use_prediction_module: # The two branches of PM design (c) a = src.block(x) - + b = src.conv(x) b = src.bn(b) b = F.relu(b) - + # TODO: Possibly switch this out for a product x = a + b @@ -166,19 +214,46 @@ def forward(self, x): conf_x = src.conf_extra(x) mask_x = src.mask_extra(x) - bbox = src.bbox_layer(bbox_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, 4) - conf = src.conf_layer(conf_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.num_classes) - + bbox = ( + src.bbox_layer(bbox_x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, 4) + ) + conf = ( + src.conf_layer(conf_x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, self.num_classes) + ) + if cfg.eval_mask_branch: - mask = src.mask_layer(mask_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.mask_dim) + mask = ( + src.mask_layer(mask_x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, self.mask_dim) + ) else: - mask = torch.zeros(x.size(0), bbox.size(1), self.mask_dim, device=bbox.device) + mask = torch.zeros( + x.size(0), bbox.size(1), self.mask_dim, device=bbox.device + ) if cfg.use_mask_scoring: - score = src.score_layer(x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, 1) + score = ( + src.score_layer(x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, 1) + ) if cfg.use_instance_coeff: - inst = src.inst_layer(x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, cfg.num_instance_coeffs) + inst = ( + src.inst_layer(x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, cfg.num_instance_coeffs) + ) # See box_utils.decode for an explanation of this if cfg.use_yolo_regressors: @@ -193,22 +268,38 @@ def forward(self, x): mask = cfg.mask_proto_coeff_activation(mask) if cfg.mask_proto_coeff_gate: - gate = src.gate_layer(x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.mask_dim) + gate = ( + src.gate_layer(x) + .permute(0, 2, 3, 1) + .contiguous() + .view(x.size(0), -1, self.mask_dim) + ) mask = mask * torch.sigmoid(gate) - if cfg.mask_proto_split_prototypes_by_head and cfg.mask_type == mask_type.lincomb: - mask = F.pad(mask, (self.index * self.mask_dim, (self.num_heads - self.index - 1) * self.mask_dim), mode='constant', value=0) - + if ( + cfg.mask_proto_split_prototypes_by_head + and cfg.mask_type == mask_type.lincomb + ): + mask = F.pad( + mask, + ( + self.index * self.mask_dim, + (self.num_heads - self.index - 1) * self.mask_dim, + ), + mode="constant", + value=0, + ) + priors = self.make_priors(conv_h, conv_w, x.device) - preds = { 'loc': bbox, 'conf': conf, 'mask': mask, 'priors': priors } + preds = {"loc": bbox, "conf": conf, "mask": mask, "priors": priors} if cfg.use_mask_scoring: - preds['score'] = score + preds["score"] = score if cfg.use_instance_coeff: - preds['inst'] = inst - + preds["inst"] = inst + return preds def make_priors(self, conv_h, conv_w, device): @@ -216,7 +307,7 @@ def make_priors(self, conv_h, conv_w, device): global prior_cache size = (conv_h, conv_w) - with timer.env('makepriors'): + with timer.env("makepriors"): if self.last_img_size != (cfg._tmp_img_w, cfg._tmp_img_h): prior_data = [] @@ -225,7 +316,7 @@ def make_priors(self, conv_h, conv_w, device): # +0.5 because priors are in center-size notation x = (i + 0.5) / conv_w y = (j + 0.5) / conv_h - + for ars in self.aspect_ratios: for scale in self.scales: for ar in ars: @@ -238,14 +329,16 @@ def make_priors(self, conv_h, conv_w, device): else: w = scale * ar / conv_w h = scale / ar / conv_h - + # This is for backward compatability with a bug where I made everything square by accident if cfg.backbone.use_square_anchors: h = w prior_data += [x, y, w, h] - self.priors = torch.Tensor(prior_data, device=device).view(-1, 4).detach() + self.priors = ( + torch.Tensor(prior_data, device=device).view(-1, 4).detach() + ) self.priors.requires_grad = False self.last_img_size = (cfg._tmp_img_w, cfg._tmp_img_h) self.last_conv_size = (conv_w, conv_h) @@ -254,14 +347,15 @@ def make_priors(self, conv_h, conv_w, device): # This whole weird situation is so that DataParalell doesn't copy the priors each iteration if prior_cache[size] is None: prior_cache[size] = {} - + if device not in prior_cache[size]: prior_cache[size][device] = self.priors.to(device) self.priors = prior_cache[size][device] - + return self.priors + class FPN(ScriptModuleWrapper): """ Implements a general version of the FPN introduced in @@ -277,38 +371,64 @@ class FPN(ScriptModuleWrapper): - in_channels (list): For each conv layer you supply in the forward pass, how many features will it have? """ - __constants__ = ['interpolation_mode', 'num_downsample', 'use_conv_downsample', 'relu_pred_layers', - 'lat_layers', 'pred_layers', 'downsample_layers', 'relu_downsample_layers'] + + __constants__ = [ + "interpolation_mode", + "num_downsample", + "use_conv_downsample", + "relu_pred_layers", + "lat_layers", + "pred_layers", + "downsample_layers", + "relu_downsample_layers", + ] def __init__(self, in_channels): super().__init__() - self.lat_layers = nn.ModuleList([ - nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1) - for x in reversed(in_channels) - ]) + self.lat_layers = nn.ModuleList( + [ + nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1) + for x in reversed(in_channels) + ] + ) # This is here for backwards compatability padding = 1 if cfg.fpn.pad else 0 - self.pred_layers = nn.ModuleList([ - nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=padding) - for _ in in_channels - ]) + self.pred_layers = nn.ModuleList( + [ + nn.Conv2d( + cfg.fpn.num_features, + cfg.fpn.num_features, + kernel_size=3, + padding=padding, + ) + for _ in in_channels + ] + ) if cfg.fpn.use_conv_downsample: - self.downsample_layers = nn.ModuleList([ - nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=1, stride=2) - for _ in range(cfg.fpn.num_downsample) - ]) - - self.interpolation_mode = cfg.fpn.interpolation_mode - self.num_downsample = cfg.fpn.num_downsample - self.use_conv_downsample = cfg.fpn.use_conv_downsample + self.downsample_layers = nn.ModuleList( + [ + nn.Conv2d( + cfg.fpn.num_features, + cfg.fpn.num_features, + kernel_size=3, + padding=1, + stride=2, + ) + for _ in range(cfg.fpn.num_downsample) + ] + ) + + self.interpolation_mode = cfg.fpn.interpolation_mode + self.num_downsample = cfg.fpn.num_downsample + self.use_conv_downsample = cfg.fpn.use_conv_downsample self.relu_downsample_layers = cfg.fpn.relu_downsample_layers - self.relu_pred_layers = cfg.fpn.relu_pred_layers + self.relu_pred_layers = cfg.fpn.relu_pred_layers @script_method_wrapper - def forward(self, convouts:List[torch.Tensor]): + def forward(self, convouts: List[torch.Tensor]): """ Args: - convouts (list): A list of convouts for the corresponding layers in in_channels. @@ -329,11 +449,13 @@ def forward(self, convouts:List[torch.Tensor]): if j < len(convouts) - 1: _, _, h, w = convouts[j].size() - x = F.interpolate(x, size=(h, w), mode=self.interpolation_mode, align_corners=False) - + x = F.interpolate( + x, size=(h, w), mode=self.interpolation_mode, align_corners=False + ) + x = x + lat_layer(convouts[j]) out[j] = x - + # This janky second loop is here because TorchScript. j = len(convouts) for pred_layer in self.pred_layers: @@ -360,13 +482,15 @@ def forward(self, convouts:List[torch.Tensor]): return out -class FastMaskIoUNet(ScriptModuleWrapper): +class FastMaskIoUNet(ScriptModuleWrapper): def __init__(self): super().__init__() input_channels = 1 - last_layer = [(cfg.num_classes-1, 1, {})] - self.maskiou_net, _ = make_net(input_channels, cfg.maskiou_net + last_layer, include_last_relu=True) + last_layer = [(cfg.num_classes - 1, 1, {})] + self.maskiou_net, _ = make_net( + input_channels, cfg.maskiou_net + last_layer, include_last_relu=True + ) def forward(self, x): x = self.maskiou_net(x) @@ -375,7 +499,6 @@ def forward(self, x): return maskiou_p - class Yolact(nn.Module): """ @@ -406,7 +529,7 @@ def __init__(self): # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early! if cfg.mask_type == mask_type.direct: - cfg.mask_dim = cfg.mask_size**2 + cfg.mask_dim = cfg.mask_size ** 2 elif cfg.mask_type == mask_type.lincomb: if cfg.mask_proto_use_grid: self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file)) @@ -415,19 +538,23 @@ def __init__(self): self.num_grids = 0 self.proto_src = cfg.mask_proto_src - - if self.proto_src is None: in_channels = 3 - elif cfg.fpn is not None: in_channels = cfg.fpn.num_features - else: in_channels = self.backbone.channels[self.proto_src] + + if self.proto_src is None: + in_channels = 3 + elif cfg.fpn is not None: + in_channels = cfg.fpn.num_features + else: + in_channels = self.backbone.channels[self.proto_src] in_channels += self.num_grids # The include_last_relu=false here is because we might want to change it to another function - self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False) + self.proto_net, cfg.mask_dim = make_net( + in_channels, cfg.mask_proto_net, include_last_relu=False + ) if cfg.mask_proto_bias: cfg.mask_dim += 1 - self.selected_layers = cfg.backbone.selected_layers src_channels = self.backbone.channels @@ -437,10 +564,11 @@ def __init__(self): if cfg.fpn is not None: # Some hacky rewiring to accomodate the FPN self.fpn = FPN([src_channels[i] for i in self.selected_layers]) - self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample)) + self.selected_layers = list( + range(len(self.selected_layers) + cfg.fpn.num_downsample) + ) src_channels = [cfg.fpn.num_features] * len(self.selected_layers) - self.prediction_layers = nn.ModuleList() cfg.num_heads = len(self.selected_layers) @@ -450,11 +578,14 @@ def __init__(self): if cfg.share_prediction_module and idx > 0: parent = self.prediction_layers[0] - pred = PredictionModule(src_channels[layer_idx], src_channels[layer_idx], - aspect_ratios = cfg.backbone.pred_aspect_ratios[idx], - scales = cfg.backbone.pred_scales[idx], - parent = parent, - index = idx) + pred = PredictionModule( + src_channels[layer_idx], + src_channels[layer_idx], + aspect_ratios=cfg.backbone.pred_aspect_ratios[idx], + scales=cfg.backbone.pred_scales[idx], + parent=parent, + index=idx, + ) self.prediction_layers.append(pred) # Extra parameters for the extra losses @@ -462,30 +593,42 @@ def __init__(self): # This comes from the smallest layer selected # Also note that cfg.num_classes includes background self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1) - + if cfg.use_semantic_segmentation_loss: - self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes-1, kernel_size=1) + self.semantic_seg_conv = nn.Conv2d( + src_channels[0], cfg.num_classes - 1, kernel_size=1 + ) # For use in evaluation - self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=cfg.nms_top_k, - conf_thresh=cfg.nms_conf_thresh, nms_thresh=cfg.nms_thresh) + self.detect = Detect( + cfg.num_classes, + bkg_label=0, + top_k=cfg.nms_top_k, + conf_thresh=cfg.nms_conf_thresh, + nms_thresh=cfg.nms_thresh, + ) def save_weights(self, path): """ Saves the model's weights using compression because the file sizes were getting too big. """ torch.save(self.state_dict(), path) - + def load_weights(self, path): """ Loads weights from a compressed save file. """ state_dict = torch.load(path) # For backward compatability, remove these (the new variable is called layers) for key in list(state_dict.keys()): - if key.startswith('backbone.layer') and not key.startswith('backbone.layers'): + if key.startswith("backbone.layer") and not key.startswith( + "backbone.layers" + ): del state_dict[key] - + # Also for backward compatibility with v1.0 weights, do this check - if key.startswith('fpn.downsample_layers.'): - if cfg.fpn is not None and int(key.split('.')[2]) >= cfg.fpn.num_downsample: + if key.startswith("fpn.downsample_layers."): + if ( + cfg.fpn is not None + and int(key.split(".")[2]) >= cfg.fpn.num_downsample + ): del state_dict[key] self.load_state_dict(state_dict) @@ -494,8 +637,8 @@ def init_weights(self, backbone_path): # Initialize the backbone with the pretrained weights. self.backbone.init_backbone(backbone_path) - conv_constants = getattr(nn.Conv2d(1, 1, 1), '__constants__') - + conv_constants = getattr(nn.Conv2d(1, 1, 1), "__constants__") + # Quick lambda to test if one list contains the other def all_in(x, y): for _x in x: @@ -510,23 +653,23 @@ def all_in(x, y): # Broke in 1.4 (see issue #292), where RecursiveScriptModule is the new star of the show. # Note that this might break with future pytorch updates, so let me know if it does is_script_conv = False - if 'Script' in type(module).__name__: + if "Script" in type(module).__name__: # 1.4 workaround: now there's an original_name member so just use that - if hasattr(module, 'original_name'): - is_script_conv = 'Conv' in module.original_name + if hasattr(module, "original_name"): + is_script_conv = "Conv" in module.original_name # 1.3 workaround: check if this has the same constants as a conv module else: - is_script_conv = ( - all_in(module.__dict__['_constants_set'], conv_constants) - and all_in(conv_constants, module.__dict__['_constants_set'])) - + is_script_conv = all_in( + module.__dict__["_constants_set"], conv_constants + ) and all_in(conv_constants, module.__dict__["_constants_set"]) + is_conv_layer = isinstance(module, nn.Conv2d) or is_script_conv if is_conv_layer and module not in self.backbone.backbone_modules: nn.init.xavier_uniform_(module.weight.data) if module.bias is not None: - if cfg.use_focal_loss and 'conf_layer' in name: + if cfg.use_focal_loss and "conf_layer" in name: if not cfg.use_sigmoid_focal_loss: # Initialize the last layer as in the focal loss paper. # Because we use softmax and not sigmoid, I had to derive an alternate expression @@ -538,14 +681,20 @@ def all_in(x, y): # For simplicity (and because we have a degree of freedom here), set z = 1. Then we have # x_0 = log((1 - pi) / pi) note: don't split up the log for numerical stability # x_i = -log(c) for all i > 0 - module.bias.data[0] = np.log((1 - cfg.focal_loss_init_pi) / cfg.focal_loss_init_pi) + module.bias.data[0] = np.log( + (1 - cfg.focal_loss_init_pi) / cfg.focal_loss_init_pi + ) module.bias.data[1:] = -np.log(module.bias.size(0) - 1) else: - module.bias.data[0] = -np.log(cfg.focal_loss_init_pi / (1 - cfg.focal_loss_init_pi)) - module.bias.data[1:] = -np.log((1 - cfg.focal_loss_init_pi) / cfg.focal_loss_init_pi) + module.bias.data[0] = -np.log( + cfg.focal_loss_init_pi / (1 - cfg.focal_loss_init_pi) + ) + module.bias.data[1:] = -np.log( + (1 - cfg.focal_loss_init_pi) / cfg.focal_loss_init_pi + ) else: module.bias.data.zero_() - + def train(self, mode=True): super().train(mode) @@ -560,27 +709,27 @@ def freeze_bn(self, enable=False): module.weight.requires_grad = enable module.bias.requires_grad = enable - + def forward(self, x): """ The input should be of size [batch_size, 3, img_h, img_w] """ _, _, img_h, img_w = x.size() cfg._tmp_img_h = img_h cfg._tmp_img_w = img_w - - with timer.env('backbone'): + + with timer.env("backbone"): outs = self.backbone(x) if cfg.fpn is not None: - with timer.env('fpn'): + with timer.env("fpn"): # Use backbone.selected_layers because we overwrote self.selected_layers outs = [outs[i] for i in cfg.backbone.selected_layers] outs = self.fpn(outs) proto_out = None if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch: - with timer.env('proto'): + with timer.env("proto"): proto_x = x if self.proto_src is None else outs[self.proto_src] - + if self.num_grids > 0: grids = self.grid.repeat(proto_x.size(0), 1, 1, 1) proto_x = torch.cat([proto_x, grids], dim=1) @@ -594,7 +743,7 @@ def forward(self, x): if cfg.mask_proto_prototypes_as_features_no_grad: proto_downsampled = proto_out.detach() - + # Move the features last so the multiplication is easy proto_out = proto_out.permute(0, 2, 3, 1).contiguous() @@ -603,30 +752,40 @@ def forward(self, x): bias_shape[-1] = 1 proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1) - - with timer.env('pred_heads'): - pred_outs = { 'loc': [], 'conf': [], 'mask': [], 'priors': [] } + with timer.env("pred_heads"): + pred_outs = {"loc": [], "conf": [], "mask": [], "priors": []} if cfg.use_mask_scoring: - pred_outs['score'] = [] + pred_outs["score"] = [] if cfg.use_instance_coeff: - pred_outs['inst'] = [] - + pred_outs["inst"] = [] + for idx, pred_layer in zip(self.selected_layers, self.prediction_layers): pred_x = outs[idx] - if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features: + if ( + cfg.mask_type == mask_type.lincomb + and cfg.mask_proto_prototypes_as_features + ): # Scale the prototypes down to the current prediction layer's size and add it as inputs - proto_downsampled = F.interpolate(proto_downsampled, size=outs[idx].size()[2:], mode='bilinear', align_corners=False) + proto_downsampled = F.interpolate( + proto_downsampled, + size=outs[idx].size()[2:], + mode="bilinear", + align_corners=False, + ) pred_x = torch.cat([pred_x, proto_downsampled], dim=1) # A hack for the way dataparallel works - if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]: + if ( + cfg.share_prediction_module + and pred_layer is not self.prediction_layers[0] + ): pred_layer.parent = [self.prediction_layers[0]] p = pred_layer(pred_x) - + for k, v in p.items(): pred_outs[k].append(v) @@ -634,68 +793,74 @@ def forward(self, x): pred_outs[k] = torch.cat(v, -2) if proto_out is not None: - pred_outs['proto'] = proto_out + pred_outs["proto"] = proto_out if self.training: # For the extra loss functions if cfg.use_class_existence_loss: - pred_outs['classes'] = self.class_existence_fc(outs[-1].mean(dim=(2, 3))) + pred_outs["classes"] = self.class_existence_fc( + outs[-1].mean(dim=(2, 3)) + ) if cfg.use_semantic_segmentation_loss: - pred_outs['segm'] = self.semantic_seg_conv(outs[0]) + pred_outs["segm"] = self.semantic_seg_conv(outs[0]) return pred_outs else: if cfg.use_mask_scoring: - pred_outs['score'] = torch.sigmoid(pred_outs['score']) + pred_outs["score"] = torch.sigmoid(pred_outs["score"]) if cfg.use_focal_loss: if cfg.use_sigmoid_focal_loss: # Note: even though conf[0] exists, this mode doesn't train it so don't use it - pred_outs['conf'] = torch.sigmoid(pred_outs['conf']) + pred_outs["conf"] = torch.sigmoid(pred_outs["conf"]) if cfg.use_mask_scoring: - pred_outs['conf'] *= pred_outs['score'] + pred_outs["conf"] *= pred_outs["score"] elif cfg.use_objectness_score: # See focal_loss_sigmoid in multibox_loss.py for details - objectness = torch.sigmoid(pred_outs['conf'][:, :, 0]) - pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(pred_outs['conf'][:, :, 1:], -1) - pred_outs['conf'][:, :, 0 ] = 1 - objectness + objectness = torch.sigmoid(pred_outs["conf"][:, :, 0]) + pred_outs["conf"][:, :, 1:] = objectness[:, :, None] * F.softmax( + pred_outs["conf"][:, :, 1:], -1 + ) + pred_outs["conf"][:, :, 0] = 1 - objectness else: - pred_outs['conf'] = F.softmax(pred_outs['conf'], -1) + pred_outs["conf"] = F.softmax(pred_outs["conf"], -1) else: if cfg.use_objectness_score: - objectness = torch.sigmoid(pred_outs['conf'][:, :, 0]) - - pred_outs['conf'][:, :, 1:] = (objectness > 0.10)[..., None] \ - * F.softmax(pred_outs['conf'][:, :, 1:], dim=-1) - - else: - pred_outs['conf'] = F.softmax(pred_outs['conf'], -1) + objectness = torch.sigmoid(pred_outs["conf"][:, :, 0]) - return self.detect(pred_outs, self) + pred_outs["conf"][:, :, 1:] = (objectness > 0.10)[ + ..., None + ] * F.softmax(pred_outs["conf"][:, :, 1:], dim=-1) + else: + pred_outs["conf"] = F.softmax(pred_outs["conf"], -1) + return self.detect(pred_outs, self) # Some testing code -if __name__ == '__main__': +if __name__ == "__main__": from utils.functions import init_console + init_console() # Use the first argument to set the config if you want import sys + if len(sys.argv) > 1: from data.config import set_cfg + set_cfg(sys.argv[1]) net = Yolact() net.train() - net.init_weights(backbone_path='weights/' + cfg.backbone.path) + net.init_weights(backbone_path="weights/" + cfg.backbone.path) # GPU net = net.cuda() - torch.set_default_tensor_type('torch.cuda.FloatTensor') + torch.set_default_tensor_type("torch.cuda.FloatTensor") x = torch.zeros((1, 3, cfg.max_size, cfg.max_size)) y = net(x) @@ -705,20 +870,23 @@ def forward(self, x): print() for k, a in y.items(): - print(k + ': ', a.size(), torch.sum(a)) + print(k + ": ", a.size(), torch.sum(a)) exit() - + net(x) # timer.disable('pass2') avg = MovingAverage() try: while True: timer.reset() - with timer.env('everything else'): + with timer.env("everything else"): net(x) avg.add(timer.total_time()) - print('\033[2J') # Moves console cursor to 0,0 + print("\033[2J") # Moves console cursor to 0,0 timer.print_stats() - print('Avg fps: %.2f\tAvg ms: %.2f ' % (1/avg.get_avg(), avg.get_avg()*1000)) + print( + "Avg fps: %.2f\tAvg ms: %.2f " + % (1 / avg.get_avg(), avg.get_avg() * 1000) + ) except KeyboardInterrupt: pass From 169ac9c0ce61529610c580d5ec066b7478d084c8 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Mon, 24 Feb 2020 17:11:08 +0700 Subject: [PATCH 02/10] APEX Amp Support --- backbone.py | 45 +++++++++++++++++++++++++++------ data/config.py | 2 ++ external/DCNv2/dcn_v2.py | 54 +++++++++++++++++++++++++++++++++++++--- train.py | 16 ++++++++++-- 4 files changed, 105 insertions(+), 12 deletions(-) diff --git a/backbone.py b/backbone.py index 254168008..3996e364b 100644 --- a/backbone.py +++ b/backbone.py @@ -28,6 +28,7 @@ def __init__( norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False, + use_amp=False, ): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d( @@ -43,6 +44,7 @@ def __init__( padding=dilation, dilation=dilation, deformable_groups=1, + use_amp=use_amp, ) self.conv2.bias.data.zero_() self.conv2.conv_offset_mask.weight.data.zero_() @@ -100,6 +102,7 @@ def __init__( atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d, + use_amp=False, ): super().__init__() @@ -120,7 +123,12 @@ def __init__( self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self._make_layer( - block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval + block, + 64, + layers[0], + dcn_layers=dcn_layers[0], + dcn_interval=dcn_interval, + use_amp=use_amp, ) self._make_layer( block, @@ -129,6 +137,7 @@ def __init__( stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval, + use_amp=use_amp, ) self._make_layer( block, @@ -137,6 +146,7 @@ def __init__( stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval, + use_amp=use_amp, ) self._make_layer( block, @@ -145,6 +155,7 @@ def __init__( stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval, + use_amp=use_amp, ) # This contains every module that should be initialized by loading in pretrained weights. @@ -154,7 +165,14 @@ def __init__( self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] def _make_layer( - self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1 + self, + block, + planes, + blocks, + stride=1, + dcn_layers=0, + dcn_interval=1, + use_amp=False, ): """ Here one layer means a string of n Bottleneck blocks. """ downsample = None @@ -189,6 +207,7 @@ def _make_layer( self.norm_layer, self.dilation, use_dcn=use_dcn, + use_amp=use_amp, ) ) self.inplanes = planes * block.expansion @@ -196,7 +215,11 @@ def _make_layer( use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0) layers.append( block( - self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn + self.inplanes, + planes, + norm_layer=self.norm_layer, + use_dcn=use_dcn, + use_amp=use_amp, ) ) layer = nn.Sequential(*layers) @@ -236,10 +259,16 @@ def init_backbone(self, path): # Note: Using strict=False is berry scary. Triple check this. self.load_state_dict(state_dict, strict=False) - def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): + def add_layer( + self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck, use_amp=False + ): """ Add a downsample layer to the backbone as per what SSD does. """ self._make_layer( - block, conv_channels // block.expansion, blocks=depth, stride=downsample + block, + conv_channels // block.expansion, + blocks=depth, + stride=downsample, + use_amp=use_amp, ) @@ -549,6 +578,8 @@ def construct_backbone(cfg): num_layers = max(cfg.selected_layers) + 1 while len(backbone.layers) < num_layers: - backbone.add_layer() - + if cfg.use_amp: + backbone.add_layer(cfg.use_amp) + else: + backbone.add_layer() return backbone diff --git a/data/config.py b/data/config.py index b46cbb841..e3d15adeb 100644 --- a/data/config.py +++ b/data/config.py @@ -768,6 +768,8 @@ def print(self): "rescore_mask": False, "rescore_bbox": False, "maskious_to_train": -1, + # Additional Settings + "use_amp": True, } ) diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index 885c8898f..eeabb354b 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -26,12 +26,21 @@ def forward( padding, dilation, deformable_groups, + use_amp ): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups + ctx.use_amp=use_amp + + if use_amp: + input=input.float() + bias=bias.float() + offset=bias.float() + mask=mask.float() + output = _backend.dcn_v2_forward( input, weight, @@ -49,7 +58,10 @@ def forward( ctx.deformable_groups, ) ctx.save_for_backward(input, offset, mask, weight, bias) - return output + if use_amp: + return output.half() + else: + return output @staticmethod @once_differentiable @@ -79,6 +91,13 @@ def backward(ctx, grad_output): ctx.deformable_groups, ) + if ctx.use_amp: + grad_input=grad_input.half() + grad_offset=grad_offset.half() + grad_mask=grad_mask.half() + grad_weight=grad_weight.half() + grad_bias=grad_bias.half() + return ( grad_input, grad_offset, @@ -105,6 +124,7 @@ def __init__( padding, dilation=1, deformable_groups=1, + use_amp ): super(DCNv2, self).__init__() self.in_channels = in_channels @@ -114,6 +134,7 @@ def __init__( self.padding = _pair(padding) self.dilation = _pair(dilation) self.deformable_groups = deformable_groups + self.use_amp=use_amp self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels, *self.kernel_size) @@ -148,6 +169,7 @@ def forward(self, input, offset, mask): self.padding, self.dilation, self.deformable_groups, + use_amp=self.use_amp ) @@ -161,6 +183,7 @@ def __init__( padding, dilation=1, deformable_groups=1, + use_amp ): super(DCN, self).__init__( in_channels, @@ -170,8 +193,9 @@ def __init__( padding, dilation, deformable_groups, + use_amp ) - + self.use_amp=use_amp channels_ = ( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] ) @@ -204,6 +228,7 @@ def forward(self, input): self.padding, self.dilation, self.deformable_groups, + use_amp=self.use_amp ) @@ -222,6 +247,7 @@ def forward( part_size=None, sample_per_part=4, trans_std=0.0, + use_amp ): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) @@ -231,6 +257,12 @@ def forward( ctx.part_size = pooled_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std + ctx.use_amp=use_amp + + if use_amp: + input=input.float() + rois=rois.float() + offset=offset.float() output, output_count = _backend.dcn_v2_psroi_pooling_forward( input, @@ -246,7 +278,10 @@ def forward( ctx.trans_std, ) ctx.save_for_backward(input, rois, offset, output_count) - return output + if use_amp: + return output.half() + else: + return output @staticmethod @once_differentiable @@ -268,6 +303,10 @@ def backward(ctx, grad_output): ctx.trans_std, ) + if ctx.use_amp: + grad_input=grad_input.half() + grad_offset=grad_offset.half() + return ( grad_input, None, @@ -297,6 +336,7 @@ def __init__( part_size=None, sample_per_part=4, trans_std=0.0, + use_amp ): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale @@ -307,6 +347,7 @@ def __init__( self.part_size = pooled_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std + self.use_amp=use_amp def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim @@ -324,6 +365,7 @@ def forward(self, input, rois, offset): self.part_size, self.sample_per_part, self.trans_std, + use_amp=self.use_amp ) @@ -339,6 +381,7 @@ def __init__( sample_per_part=4, trans_std=0.0, deform_fc_dim=1024, + use_amp ): super(DCNPooling, self).__init__( spatial_scale, @@ -349,7 +392,9 @@ def __init__( part_size, sample_per_part, trans_std, + use_amp=use_amp ) + self.use_amp=use_amp self.deform_fc_dim = deform_fc_dim @@ -386,6 +431,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, + use_amp=self.use_amp ) # build mask and offset @@ -409,6 +455,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, + use_amp=self.use_amp ) * mask ) @@ -425,4 +472,5 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, + use_amp=self.use_amp ) diff --git a/train.py b/train.py index 8a2e82a82..28e8bd418 100644 --- a/train.py +++ b/train.py @@ -24,6 +24,8 @@ # Oof import eval as eval_script +# APEX Automated Mix Precision for 16 bit computation + def str2bool(v): return v.lower() in ("yes", "true", "t", "1") @@ -330,9 +332,15 @@ def train(): ) exit(-1) - net = CustomDataParallel(NetLoss(net, criterion)) if args.cuda: net = net.cuda() + if cfg.use_amp: + from apex import amp + + if not args.cuda: + raise ValueError("amp must be used with CUDA") + net, optimizer = amp.initialize(net, optimizer, opt_level="O1") + net = CustomDataParallel(NetLoss(net, criterion)) # Initialize everything if not cfg.freeze_bn: @@ -437,7 +445,11 @@ def train(): # all_loss = sum([v.mean() for v in losses.values()]) # Backprop - loss.backward() # Do this to free up vram even if loss is not finite + if cfg.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() # Do this to free up vram even if loss is not finite if torch.isfinite(loss).item(): optimizer.step() From 2a778e9cf6ad384dd241d68c794ce7c0f8ca2ecc Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 09:45:52 +0700 Subject: [PATCH 03/10] Backup before merge with remote --- backbone.py | 43 +++++------------- data/coco.py | 3 +- data/config.py | 76 +++++++++++++++++++++++++++++++ external/DCNv2/dcn_v2.py | 80 ++++++++++++++++++--------------- layers/modules/multibox_loss.py | 11 +++-- yolact.py | 15 ++++--- 6 files changed, 148 insertions(+), 80 deletions(-) diff --git a/backbone.py b/backbone.py index 3996e364b..78c162df3 100644 --- a/backbone.py +++ b/backbone.py @@ -113,6 +113,7 @@ def __init__( self.norm_layer = norm_layer self.dilation = 1 self.atrous_layers = atrous_layers + self.use_amp = use_amp # From torchvision.models.resnet.Resnet self.inplanes = 64 @@ -123,12 +124,7 @@ def __init__( self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self._make_layer( - block, - 64, - layers[0], - dcn_layers=dcn_layers[0], - dcn_interval=dcn_interval, - use_amp=use_amp, + block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval, ) self._make_layer( block, @@ -137,7 +133,6 @@ def __init__( stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval, - use_amp=use_amp, ) self._make_layer( block, @@ -146,7 +141,6 @@ def __init__( stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval, - use_amp=use_amp, ) self._make_layer( block, @@ -155,7 +149,6 @@ def __init__( stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval, - use_amp=use_amp, ) # This contains every module that should be initialized by loading in pretrained weights. @@ -165,14 +158,7 @@ def __init__( self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] def _make_layer( - self, - block, - planes, - blocks, - stride=1, - dcn_layers=0, - dcn_interval=1, - use_amp=False, + self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1, ): """ Here one layer means a string of n Bottleneck blocks. """ downsample = None @@ -207,7 +193,7 @@ def _make_layer( self.norm_layer, self.dilation, use_dcn=use_dcn, - use_amp=use_amp, + use_amp=self.use_amp, ) ) self.inplanes = planes * block.expansion @@ -219,7 +205,7 @@ def _make_layer( planes, norm_layer=self.norm_layer, use_dcn=use_dcn, - use_amp=use_amp, + use_amp=self.use_amp, ) ) layer = nn.Sequential(*layers) @@ -259,16 +245,10 @@ def init_backbone(self, path): # Note: Using strict=False is berry scary. Triple check this. self.load_state_dict(state_dict, strict=False) - def add_layer( - self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck, use_amp=False - ): + def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): """ Add a downsample layer to the backbone as per what SSD does. """ self._make_layer( - block, - conv_channels // block.expansion, - blocks=depth, - stride=downsample, - use_amp=use_amp, + block, conv_channels // block.expansion, blocks=depth, stride=downsample, ) @@ -570,16 +550,13 @@ def add_layer(self, conv_channels=128, downsample=2): self.layers.append(layer) -def construct_backbone(cfg): +def construct_backbone(cfg, use_amp): """ Constructs a backbone given a backbone config object (see config.py). """ - backbone = cfg.type(*cfg.args) + backbone = cfg.type(*cfg.args, use_amp=use_amp) # Add downsampling layers until we reach the number we need num_layers = max(cfg.selected_layers) + 1 while len(backbone.layers) < num_layers: - if cfg.use_amp: - backbone.add_layer(cfg.use_amp) - else: - backbone.add_layer() + backbone.add_layer() return backbone diff --git a/data/coco.py b/data/coco.py index 75e5197c0..3be2ba00a 100644 --- a/data/coco.py +++ b/data/coco.py @@ -42,7 +42,8 @@ def __call__(self, target, width, height): bbox = obj["bbox"] label_idx = obj["category_id"] if label_idx >= 0: - label_idx = self.label_map[label_idx] - 1 + # TODO: remove this after TACO testing + label_idx = self.label_map[label_idx + 1] - 1 final_box = list( np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) / scale diff --git a/data/config.py b/data/config.py index e3d15adeb..6377d283f 100644 --- a/data/config.py +++ b/data/config.py @@ -325,6 +325,82 @@ def print(self): } ) +# TODO: Please remove this test dataset + +taco_dataset = dataset_base.copy( + { + "name": "TACO", + "train_images": "/home/rtx/arm/TACO/data", + "train_info": "/home/rtx/arm/TACO/data/annotations_0_train.json", + "valid_images": "/home/rtx/arm/TACO/data", + "valid_info": "/home/rtx/arm/TACO/data/annotations_0_val.json", + "has_gt": True, + "class_names": ( + "Aluminium foil", + "Battery", + "Aluminium blister pack", + "Carded blister pack", + "Other plastic bottle", + "Clear plastic bottle", + "Glass bottle", + "Plastic bottle cap", + "Metal bottle cap", + "Broken glass", + "Food Can", + "Aerosol", + "Drink can", + "Toilet tube", + "Other carton", + "Egg carton", + "Drink carton", + "Corrugated carton", + "Meal carton", + "Pizza box", + "Paper cup", + "Disposable plastic cup", + "Foam cup", + "Glass cup", + "Other plastic cup", + "Food waste", + "Glass jar", + "Plastic lid", + "Metal lid", + "Other plastic", + "Magazine paper", + "Tissues", + "Wrapping paper", + "Normal paper", + "Paper bag", + "Plastified paper bag", + "Plastic film", + "Six pack rings", + "Garbage bag", + "Other plastic wrapper", + "Single-use carrier bag", + "Polypropylene bag", + "Crisp packet", + "Spread tub", + "Tupperware", + "Disposable food container", + "Foam food container", + "Other plastic container", + "Plastic glooves", + "Plastic utensils", + "Pop tab", + "Rope & strings", + "Scrap metal", + "Shoe", + "Squeezable tube", + "Plastic straw", + "Paper straw", + "Styrofoam piece", + "Unlabeled litter", + "Cigarette", + ), + # "label_map": {i: i for i in range(1, 61)}, + } +) + # ----------------------- TRANSFORMS ----------------------- # diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index eeabb354b..142f424ff 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -26,20 +26,21 @@ def forward( padding, dilation, deformable_groups, - use_amp + use_amp, ): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups - ctx.use_amp=use_amp + ctx.use_amp = use_amp if use_amp: - input=input.float() - bias=bias.float() - offset=bias.float() - mask=mask.float() + input = input.float() + offset = bias.float() + mask = mask.float() + weight = weight.float() + bias = bias.float() output = _backend.dcn_v2_forward( input, @@ -60,13 +61,15 @@ def forward( ctx.save_for_backward(input, offset, mask, weight, bias) if use_amp: return output.half() - else: - return output + return output @staticmethod @once_differentiable def backward(ctx, grad_output): + print("in backward") input, offset, mask, weight, bias = ctx.saved_tensors + if ctx.use_amp: + grad_output = grad_output.float() ( grad_input, grad_offset, @@ -92,12 +95,11 @@ def backward(ctx, grad_output): ) if ctx.use_amp: - grad_input=grad_input.half() - grad_offset=grad_offset.half() - grad_mask=grad_mask.half() - grad_weight=grad_weight.half() - grad_bias=grad_bias.half() - + grad_input = grad_input.half() + grad_offset = grad_offset.half() + grad_mask = grad_mask.half() + grad_weight = grad_weight.half() + grad_bias = grad_bias.half() return ( grad_input, grad_offset, @@ -108,6 +110,7 @@ def backward(ctx, grad_output): None, None, None, + None, ) @@ -124,7 +127,7 @@ def __init__( padding, dilation=1, deformable_groups=1, - use_amp + use_amp=False, ): super(DCNv2, self).__init__() self.in_channels = in_channels @@ -134,7 +137,7 @@ def __init__( self.padding = _pair(padding) self.dilation = _pair(dilation) self.deformable_groups = deformable_groups - self.use_amp=use_amp + self.use_amp = use_amp self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels, *self.kernel_size) @@ -169,8 +172,9 @@ def forward(self, input, offset, mask): self.padding, self.dilation, self.deformable_groups, - use_amp=self.use_amp + self.use_amp, ) + # add if amp here class DCN(DCNv2): @@ -183,7 +187,7 @@ def __init__( padding, dilation=1, deformable_groups=1, - use_amp + use_amp=False, ): super(DCN, self).__init__( in_channels, @@ -193,9 +197,9 @@ def __init__( padding, dilation, deformable_groups, - use_amp + use_amp, ) - self.use_amp=use_amp + self.use_amp = use_amp channels_ = ( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] ) @@ -228,7 +232,7 @@ def forward(self, input): self.padding, self.dilation, self.deformable_groups, - use_amp=self.use_amp + self.use_amp, ) @@ -247,7 +251,7 @@ def forward( part_size=None, sample_per_part=4, trans_std=0.0, - use_amp + use_amp=False, ): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) @@ -257,12 +261,12 @@ def forward( ctx.part_size = pooled_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std - ctx.use_amp=use_amp + ctx.use_amp = use_amp if use_amp: - input=input.float() - rois=rois.float() - offset=offset.float() + input = input.float() + rois = rois.float() + offset = offset.float() output, output_count = _backend.dcn_v2_psroi_pooling_forward( input, @@ -304,8 +308,8 @@ def backward(ctx, grad_output): ) if ctx.use_amp: - grad_input=grad_input.half() - grad_offset=grad_offset.half() + grad_input = grad_input.half() + grad_offset = grad_offset.half() return ( grad_input, @@ -319,6 +323,7 @@ def backward(ctx, grad_output): None, None, None, + None, ) @@ -336,7 +341,7 @@ def __init__( part_size=None, sample_per_part=4, trans_std=0.0, - use_amp + use_amp=False, ): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale @@ -347,7 +352,7 @@ def __init__( self.part_size = pooled_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std - self.use_amp=use_amp + self.use_amp = use_amp def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim @@ -365,7 +370,7 @@ def forward(self, input, rois, offset): self.part_size, self.sample_per_part, self.trans_std, - use_amp=self.use_amp + self.use_amp, ) @@ -381,7 +386,7 @@ def __init__( sample_per_part=4, trans_std=0.0, deform_fc_dim=1024, - use_amp + use_amp=False, ): super(DCNPooling, self).__init__( spatial_scale, @@ -392,9 +397,9 @@ def __init__( part_size, sample_per_part, trans_std, - use_amp=use_amp + use_amp, ) - self.use_amp=use_amp + self.use_amp = use_amp self.deform_fc_dim = deform_fc_dim @@ -431,7 +436,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - use_amp=self.use_amp + self.use_amp, ) # build mask and offset @@ -455,7 +460,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - use_amp=self.use_amp + self.use_amp, ) * mask ) @@ -472,5 +477,6 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - use_amp=self.use_amp + self.use_amp, ) + diff --git a/layers/modules/multibox_loss.py b/layers/modules/multibox_loss.py index e7e8ea88b..e822d8f62 100644 --- a/layers/modules/multibox_loss.py +++ b/layers/modules/multibox_loss.py @@ -75,7 +75,7 @@ def forward(self, net, predictions, targets, masks, num_crowds): num_crowds (list): Number of crowd annotations per batch. The crowd annotations should be the last num_crowds elements of targets and masks. - + * Only if mask_type == lincomb """ @@ -288,7 +288,10 @@ def semantic_segmentation_loss( mode=interpolation_mode, align_corners=False, ).squeeze(0) - downsampled_masks = downsampled_masks.gt(0.5).float() + if cfg.use_amp: + downsampled_masks = downsampled_masks.gt(0.5).half() + else: + downsampled_masks = downsampled_masks.gt(0.5).float() # Construct Semantic Segmentation segment_t = torch.zeros_like(cur_segment, requires_grad=False) @@ -298,8 +301,8 @@ def semantic_segmentation_loss( ) loss_s += F.binary_cross_entropy_with_logits( - cur_segment, segment_t, reduction="sum" - ) + cur_segment.float(), segment_t.float(), reduction="sum" + ).half() return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha diff --git a/yolact.py b/yolact.py index fa6d73708..da0051ff3 100644 --- a/yolact.py +++ b/yolact.py @@ -26,6 +26,11 @@ if not use_jit: print("Multiple GPUs detected! Turning off JIT.") +# AMP does not support TorchScript https://github.com/NVIDIA/apex/issues/303 +if cfg.use_amp: + use_jit = False + print("Using AMP, JIT disabled") + ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn @@ -505,10 +510,10 @@ class Yolact(nn.Module): ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ - ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ - ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ - ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ - ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ + ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ + ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ + ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ + ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ You can set the arguments by changing them in the backbone config object in config.py. @@ -522,7 +527,7 @@ class Yolact(nn.Module): def __init__(self): super().__init__() - self.backbone = construct_backbone(cfg.backbone) + self.backbone = construct_backbone(cfg.backbone, cfg.use_amp) if cfg.freeze_bn: self.freeze_bn() From 834b7b05d5b1be7603ba1ca3a607b4137027816c Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 15:26:26 +0700 Subject: [PATCH 04/10] Second backup --- backbone.py | 18 +++++++------- external/DCNv2/dcn_v2.py | 51 +++++++++++++++++++++++----------------- yolact.py | 3 ++- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/backbone.py b/backbone.py index 78c162df3..7b30b29fa 100644 --- a/backbone.py +++ b/backbone.py @@ -28,7 +28,7 @@ def __init__( norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False, - use_amp=False, + # use_amp=False, ): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d( @@ -44,7 +44,7 @@ def __init__( padding=dilation, dilation=dilation, deformable_groups=1, - use_amp=use_amp, + # use_amp=use_amp, ) self.conv2.bias.data.zero_() self.conv2.conv_offset_mask.weight.data.zero_() @@ -102,7 +102,7 @@ def __init__( atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d, - use_amp=False, + # use_amp=False, ): super().__init__() @@ -113,7 +113,7 @@ def __init__( self.norm_layer = norm_layer self.dilation = 1 self.atrous_layers = atrous_layers - self.use_amp = use_amp + # self.use_amp = use_amp # From torchvision.models.resnet.Resnet self.inplanes = 64 @@ -193,7 +193,7 @@ def _make_layer( self.norm_layer, self.dilation, use_dcn=use_dcn, - use_amp=self.use_amp, + # use_amp=self.use_amp, ) ) self.inplanes = planes * block.expansion @@ -205,7 +205,7 @@ def _make_layer( planes, norm_layer=self.norm_layer, use_dcn=use_dcn, - use_amp=self.use_amp, + # use_amp=self.use_amp, ) ) layer = nn.Sequential(*layers) @@ -550,9 +550,11 @@ def add_layer(self, conv_channels=128, downsample=2): self.layers.append(layer) -def construct_backbone(cfg, use_amp): +# def construct_backbone(cfg, use_amp): +def construct_backbone(cfg): """ Constructs a backbone given a backbone config object (see config.py). """ - backbone = cfg.type(*cfg.args, use_amp=use_amp) + # backbone = cfg.type(*cfg.args, use_amp=use_amp) + backbone = cfg.type(*cfg.args) # Add downsampling layers until we reach the number we need num_layers = max(cfg.selected_layers) + 1 diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index 142f424ff..a490bdfca 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -26,14 +26,16 @@ def forward( padding, dilation, deformable_groups, - use_amp, + # use_amp, ): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups - ctx.use_amp = use_amp + # ctx.use_amp = use_amp + ctx.use_amp = True + use_amp = True if use_amp: input = input.float() @@ -110,7 +112,7 @@ def backward(ctx, grad_output): None, None, None, - None, + # None, ) @@ -127,7 +129,7 @@ def __init__( padding, dilation=1, deformable_groups=1, - use_amp=False, + # use_amp=False, ): super(DCNv2, self).__init__() self.in_channels = in_channels @@ -137,7 +139,8 @@ def __init__( self.padding = _pair(padding) self.dilation = _pair(dilation) self.deformable_groups = deformable_groups - self.use_amp = use_amp + # self.use_amp = use_amp + self.use_amp = True self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels, *self.kernel_size) @@ -172,7 +175,7 @@ def forward(self, input, offset, mask): self.padding, self.dilation, self.deformable_groups, - self.use_amp, + # self.use_amp, ) # add if amp here @@ -187,7 +190,7 @@ def __init__( padding, dilation=1, deformable_groups=1, - use_amp=False, + # use_amp=False, ): super(DCN, self).__init__( in_channels, @@ -197,9 +200,10 @@ def __init__( padding, dilation, deformable_groups, - use_amp, + # use_amp, ) - self.use_amp = use_amp + # self.use_amp = use_amp + self.use_amp = True channels_ = ( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] ) @@ -232,7 +236,7 @@ def forward(self, input): self.padding, self.dilation, self.deformable_groups, - self.use_amp, + # self.use_amp, ) @@ -251,7 +255,7 @@ def forward( part_size=None, sample_per_part=4, trans_std=0.0, - use_amp=False, + # use_amp=False, ): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) @@ -261,7 +265,8 @@ def forward( ctx.part_size = pooled_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std - ctx.use_amp = use_amp + # ctx.use_amp = use_amp + ctx.use_amp = True if use_amp: input = input.float() @@ -323,7 +328,7 @@ def backward(ctx, grad_output): None, None, None, - None, + # None, ) @@ -341,7 +346,7 @@ def __init__( part_size=None, sample_per_part=4, trans_std=0.0, - use_amp=False, + # use_amp=False, ): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale @@ -352,7 +357,8 @@ def __init__( self.part_size = pooled_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std - self.use_amp = use_amp + # self.use_amp = use_amp + self.use_amp = True def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim @@ -370,7 +376,7 @@ def forward(self, input, rois, offset): self.part_size, self.sample_per_part, self.trans_std, - self.use_amp, + # self.use_amp, ) @@ -386,7 +392,7 @@ def __init__( sample_per_part=4, trans_std=0.0, deform_fc_dim=1024, - use_amp=False, + # use_amp=False, ): super(DCNPooling, self).__init__( spatial_scale, @@ -397,9 +403,10 @@ def __init__( part_size, sample_per_part, trans_std, - use_amp, + # use_amp, ) - self.use_amp = use_amp + # self.use_amp = use_amp + self.use_amp = True self.deform_fc_dim = deform_fc_dim @@ -436,7 +443,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - self.use_amp, + # self.use_amp, ) # build mask and offset @@ -460,7 +467,7 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - self.use_amp, + # self.use_amp, ) * mask ) @@ -477,6 +484,6 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - self.use_amp, + # self.use_amp, ) diff --git a/yolact.py b/yolact.py index da0051ff3..2fd0e4209 100644 --- a/yolact.py +++ b/yolact.py @@ -527,7 +527,8 @@ class Yolact(nn.Module): def __init__(self): super().__init__() - self.backbone = construct_backbone(cfg.backbone, cfg.use_amp) + # self.backbone = construct_backbone(cfg.backbone, cfg.use_amp) + self.backbone = construct_backbone(cfg.backbone) if cfg.freeze_bn: self.freeze_bn() From f6d5d491fb1ed7807cd0a5dfbcfd5b5e773b9ae8 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 15:31:51 +0700 Subject: [PATCH 05/10] Clean --- backbone.py | 20 +++------ data/coco.py | 5 +-- data/config.py | 78 --------------------------------- external/DCNv2/dcn_v2.py | 65 +-------------------------- layers/modules/multibox_loss.py | 11 ++--- train.py | 16 +------ yolact.py | 14 ++---- 7 files changed, 19 insertions(+), 190 deletions(-) diff --git a/backbone.py b/backbone.py index 7b30b29fa..254168008 100644 --- a/backbone.py +++ b/backbone.py @@ -28,7 +28,6 @@ def __init__( norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False, - # use_amp=False, ): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d( @@ -44,7 +43,6 @@ def __init__( padding=dilation, dilation=dilation, deformable_groups=1, - # use_amp=use_amp, ) self.conv2.bias.data.zero_() self.conv2.conv_offset_mask.weight.data.zero_() @@ -102,7 +100,6 @@ def __init__( atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d, - # use_amp=False, ): super().__init__() @@ -113,7 +110,6 @@ def __init__( self.norm_layer = norm_layer self.dilation = 1 self.atrous_layers = atrous_layers - # self.use_amp = use_amp # From torchvision.models.resnet.Resnet self.inplanes = 64 @@ -124,7 +120,7 @@ def __init__( self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self._make_layer( - block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval, + block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval ) self._make_layer( block, @@ -158,7 +154,7 @@ def __init__( self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] def _make_layer( - self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1, + self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1 ): """ Here one layer means a string of n Bottleneck blocks. """ downsample = None @@ -193,7 +189,6 @@ def _make_layer( self.norm_layer, self.dilation, use_dcn=use_dcn, - # use_amp=self.use_amp, ) ) self.inplanes = planes * block.expansion @@ -201,11 +196,7 @@ def _make_layer( use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0) layers.append( block( - self.inplanes, - planes, - norm_layer=self.norm_layer, - use_dcn=use_dcn, - # use_amp=self.use_amp, + self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn ) ) layer = nn.Sequential(*layers) @@ -248,7 +239,7 @@ def init_backbone(self, path): def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): """ Add a downsample layer to the backbone as per what SSD does. """ self._make_layer( - block, conv_channels // block.expansion, blocks=depth, stride=downsample, + block, conv_channels // block.expansion, blocks=depth, stride=downsample ) @@ -550,10 +541,8 @@ def add_layer(self, conv_channels=128, downsample=2): self.layers.append(layer) -# def construct_backbone(cfg, use_amp): def construct_backbone(cfg): """ Constructs a backbone given a backbone config object (see config.py). """ - # backbone = cfg.type(*cfg.args, use_amp=use_amp) backbone = cfg.type(*cfg.args) # Add downsampling layers until we reach the number we need @@ -561,4 +550,5 @@ def construct_backbone(cfg): while len(backbone.layers) < num_layers: backbone.add_layer() + return backbone diff --git a/data/coco.py b/data/coco.py index 3be2ba00a..d0375b147 100644 --- a/data/coco.py +++ b/data/coco.py @@ -42,8 +42,7 @@ def __call__(self, target, width, height): bbox = obj["bbox"] label_idx = obj["category_id"] if label_idx >= 0: - # TODO: remove this after TACO testing - label_idx = self.label_map[label_idx + 1] - 1 + label_idx = self.label_map[label_idx] - 1 final_box = list( np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) / scale @@ -125,7 +124,7 @@ def pull_item(self, index): ann_ids = self.coco.getAnnIds(imgIds=img_id) # Target has {'segmentation', 'area', iscrowd', 'image_id', 'bbox', 'category_id'} - target = [x for x in self.coco.loadAnns(ann_ids) if x['image_id'] == img_id] + target = [x for x in self.coco.loadAnns(ann_ids) if x["image_id"] == img_id] else: target = [] diff --git a/data/config.py b/data/config.py index 6377d283f..b46cbb841 100644 --- a/data/config.py +++ b/data/config.py @@ -325,82 +325,6 @@ def print(self): } ) -# TODO: Please remove this test dataset - -taco_dataset = dataset_base.copy( - { - "name": "TACO", - "train_images": "/home/rtx/arm/TACO/data", - "train_info": "/home/rtx/arm/TACO/data/annotations_0_train.json", - "valid_images": "/home/rtx/arm/TACO/data", - "valid_info": "/home/rtx/arm/TACO/data/annotations_0_val.json", - "has_gt": True, - "class_names": ( - "Aluminium foil", - "Battery", - "Aluminium blister pack", - "Carded blister pack", - "Other plastic bottle", - "Clear plastic bottle", - "Glass bottle", - "Plastic bottle cap", - "Metal bottle cap", - "Broken glass", - "Food Can", - "Aerosol", - "Drink can", - "Toilet tube", - "Other carton", - "Egg carton", - "Drink carton", - "Corrugated carton", - "Meal carton", - "Pizza box", - "Paper cup", - "Disposable plastic cup", - "Foam cup", - "Glass cup", - "Other plastic cup", - "Food waste", - "Glass jar", - "Plastic lid", - "Metal lid", - "Other plastic", - "Magazine paper", - "Tissues", - "Wrapping paper", - "Normal paper", - "Paper bag", - "Plastified paper bag", - "Plastic film", - "Six pack rings", - "Garbage bag", - "Other plastic wrapper", - "Single-use carrier bag", - "Polypropylene bag", - "Crisp packet", - "Spread tub", - "Tupperware", - "Disposable food container", - "Foam food container", - "Other plastic container", - "Plastic glooves", - "Plastic utensils", - "Pop tab", - "Rope & strings", - "Scrap metal", - "Shoe", - "Squeezable tube", - "Plastic straw", - "Paper straw", - "Styrofoam piece", - "Unlabeled litter", - "Cigarette", - ), - # "label_map": {i: i for i in range(1, 61)}, - } -) - # ----------------------- TRANSFORMS ----------------------- # @@ -844,8 +768,6 @@ def print(self): "rescore_mask": False, "rescore_bbox": False, "maskious_to_train": -1, - # Additional Settings - "use_amp": True, } ) diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index a490bdfca..885c8898f 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -26,24 +26,12 @@ def forward( padding, dilation, deformable_groups, - # use_amp, ): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups - # ctx.use_amp = use_amp - ctx.use_amp = True - use_amp = True - - if use_amp: - input = input.float() - offset = bias.float() - mask = mask.float() - weight = weight.float() - bias = bias.float() - output = _backend.dcn_v2_forward( input, weight, @@ -61,17 +49,12 @@ def forward( ctx.deformable_groups, ) ctx.save_for_backward(input, offset, mask, weight, bias) - if use_amp: - return output.half() return output @staticmethod @once_differentiable def backward(ctx, grad_output): - print("in backward") input, offset, mask, weight, bias = ctx.saved_tensors - if ctx.use_amp: - grad_output = grad_output.float() ( grad_input, grad_offset, @@ -96,12 +79,6 @@ def backward(ctx, grad_output): ctx.deformable_groups, ) - if ctx.use_amp: - grad_input = grad_input.half() - grad_offset = grad_offset.half() - grad_mask = grad_mask.half() - grad_weight = grad_weight.half() - grad_bias = grad_bias.half() return ( grad_input, grad_offset, @@ -112,7 +89,6 @@ def backward(ctx, grad_output): None, None, None, - # None, ) @@ -129,7 +105,6 @@ def __init__( padding, dilation=1, deformable_groups=1, - # use_amp=False, ): super(DCNv2, self).__init__() self.in_channels = in_channels @@ -139,8 +114,6 @@ def __init__( self.padding = _pair(padding) self.dilation = _pair(dilation) self.deformable_groups = deformable_groups - # self.use_amp = use_amp - self.use_amp = True self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels, *self.kernel_size) @@ -175,9 +148,7 @@ def forward(self, input, offset, mask): self.padding, self.dilation, self.deformable_groups, - # self.use_amp, ) - # add if amp here class DCN(DCNv2): @@ -190,7 +161,6 @@ def __init__( padding, dilation=1, deformable_groups=1, - # use_amp=False, ): super(DCN, self).__init__( in_channels, @@ -200,10 +170,8 @@ def __init__( padding, dilation, deformable_groups, - # use_amp, ) - # self.use_amp = use_amp - self.use_amp = True + channels_ = ( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] ) @@ -236,7 +204,6 @@ def forward(self, input): self.padding, self.dilation, self.deformable_groups, - # self.use_amp, ) @@ -255,7 +222,6 @@ def forward( part_size=None, sample_per_part=4, trans_std=0.0, - # use_amp=False, ): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) @@ -265,13 +231,6 @@ def forward( ctx.part_size = pooled_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std - # ctx.use_amp = use_amp - ctx.use_amp = True - - if use_amp: - input = input.float() - rois = rois.float() - offset = offset.float() output, output_count = _backend.dcn_v2_psroi_pooling_forward( input, @@ -287,10 +246,7 @@ def forward( ctx.trans_std, ) ctx.save_for_backward(input, rois, offset, output_count) - if use_amp: - return output.half() - else: - return output + return output @staticmethod @once_differentiable @@ -312,10 +268,6 @@ def backward(ctx, grad_output): ctx.trans_std, ) - if ctx.use_amp: - grad_input = grad_input.half() - grad_offset = grad_offset.half() - return ( grad_input, None, @@ -328,7 +280,6 @@ def backward(ctx, grad_output): None, None, None, - # None, ) @@ -346,7 +297,6 @@ def __init__( part_size=None, sample_per_part=4, trans_std=0.0, - # use_amp=False, ): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale @@ -357,8 +307,6 @@ def __init__( self.part_size = pooled_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std - # self.use_amp = use_amp - self.use_amp = True def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim @@ -376,7 +324,6 @@ def forward(self, input, rois, offset): self.part_size, self.sample_per_part, self.trans_std, - # self.use_amp, ) @@ -392,7 +339,6 @@ def __init__( sample_per_part=4, trans_std=0.0, deform_fc_dim=1024, - # use_amp=False, ): super(DCNPooling, self).__init__( spatial_scale, @@ -403,10 +349,7 @@ def __init__( part_size, sample_per_part, trans_std, - # use_amp, ) - # self.use_amp = use_amp - self.use_amp = True self.deform_fc_dim = deform_fc_dim @@ -443,7 +386,6 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - # self.use_amp, ) # build mask and offset @@ -467,7 +409,6 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - # self.use_amp, ) * mask ) @@ -484,6 +425,4 @@ def forward(self, input, rois): self.part_size, self.sample_per_part, self.trans_std, - # self.use_amp, ) - diff --git a/layers/modules/multibox_loss.py b/layers/modules/multibox_loss.py index e822d8f62..e7e8ea88b 100644 --- a/layers/modules/multibox_loss.py +++ b/layers/modules/multibox_loss.py @@ -75,7 +75,7 @@ def forward(self, net, predictions, targets, masks, num_crowds): num_crowds (list): Number of crowd annotations per batch. The crowd annotations should be the last num_crowds elements of targets and masks. - + * Only if mask_type == lincomb """ @@ -288,10 +288,7 @@ def semantic_segmentation_loss( mode=interpolation_mode, align_corners=False, ).squeeze(0) - if cfg.use_amp: - downsampled_masks = downsampled_masks.gt(0.5).half() - else: - downsampled_masks = downsampled_masks.gt(0.5).float() + downsampled_masks = downsampled_masks.gt(0.5).float() # Construct Semantic Segmentation segment_t = torch.zeros_like(cur_segment, requires_grad=False) @@ -301,8 +298,8 @@ def semantic_segmentation_loss( ) loss_s += F.binary_cross_entropy_with_logits( - cur_segment.float(), segment_t.float(), reduction="sum" - ).half() + cur_segment, segment_t, reduction="sum" + ) return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha diff --git a/train.py b/train.py index 28e8bd418..8a2e82a82 100644 --- a/train.py +++ b/train.py @@ -24,8 +24,6 @@ # Oof import eval as eval_script -# APEX Automated Mix Precision for 16 bit computation - def str2bool(v): return v.lower() in ("yes", "true", "t", "1") @@ -332,15 +330,9 @@ def train(): ) exit(-1) + net = CustomDataParallel(NetLoss(net, criterion)) if args.cuda: net = net.cuda() - if cfg.use_amp: - from apex import amp - - if not args.cuda: - raise ValueError("amp must be used with CUDA") - net, optimizer = amp.initialize(net, optimizer, opt_level="O1") - net = CustomDataParallel(NetLoss(net, criterion)) # Initialize everything if not cfg.freeze_bn: @@ -445,11 +437,7 @@ def train(): # all_loss = sum([v.mean() for v in losses.values()]) # Backprop - if cfg.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() # Do this to free up vram even if loss is not finite + loss.backward() # Do this to free up vram even if loss is not finite if torch.isfinite(loss).item(): optimizer.step() diff --git a/yolact.py b/yolact.py index 2fd0e4209..fa6d73708 100644 --- a/yolact.py +++ b/yolact.py @@ -26,11 +26,6 @@ if not use_jit: print("Multiple GPUs detected! Turning off JIT.") -# AMP does not support TorchScript https://github.com/NVIDIA/apex/issues/303 -if cfg.use_amp: - use_jit = False - print("Using AMP, JIT disabled") - ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn @@ -510,10 +505,10 @@ class Yolact(nn.Module): ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ - ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ - ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ - ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ - ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ + ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ + ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ + ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ + ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ You can set the arguments by changing them in the backbone config object in config.py. @@ -527,7 +522,6 @@ class Yolact(nn.Module): def __init__(self): super().__init__() - # self.backbone = construct_backbone(cfg.backbone, cfg.use_amp) self.backbone = construct_backbone(cfg.backbone) if cfg.freeze_bn: From 2584d2301d5966005497f9355228b912631c8215 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 15:39:40 +0700 Subject: [PATCH 06/10] TACO Dataset --- data/coco.py | 2 +- data/config.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/data/coco.py b/data/coco.py index d0375b147..c8ffae63d 100644 --- a/data/coco.py +++ b/data/coco.py @@ -42,7 +42,7 @@ def __call__(self, target, width, height): bbox = obj["bbox"] label_idx = obj["category_id"] if label_idx >= 0: - label_idx = self.label_map[label_idx] - 1 + label_idx = self.label_map[label_idx + 1] - 1 final_box = list( np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) / scale diff --git a/data/config.py b/data/config.py index b46cbb841..d754e8768 100644 --- a/data/config.py +++ b/data/config.py @@ -325,6 +325,79 @@ def print(self): } ) +taco_dataset = dataset_base.copy( + { + "name": "TACO", + "train_images": "/home/rtx/arm/TACO/data", + "train_info": "/home/rtx/arm/TACO/data/annotations_0_train.json", + "valid_images": "/home/rtx/arm/TACO/data", + "valid_info": "/home/rtx/arm/TACO/data/annotations_0_val.json", + "has_gt": True, + "class_names": ( + "Aluminium foil", + "Battery", + "Aluminium blister pack", + "Carded blister pack", + "Other plastic bottle", + "Clear plastic bottle", + "Glass bottle", + "Plastic bottle cap", + "Metal bottle cap", + "Broken glass", + "Food Can", + "Aerosol", + "Drink can", + "Toilet tube", + "Other carton", + "Egg carton", + "Drink carton", + "Corrugated carton", + "Meal carton", + "Pizza box", + "Paper cup", + "Disposable plastic cup", + "Foam cup", + "Glass cup", + "Other plastic cup", + "Food waste", + "Glass jar", + "Plastic lid", + "Metal lid", + "Other plastic", + "Magazine paper", + "Tissues", + "Wrapping paper", + "Normal paper", + "Paper bag", + "Plastified paper bag", + "Plastic film", + "Six pack rings", + "Garbage bag", + "Other plastic wrapper", + "Single-use carrier bag", + "Polypropylene bag", + "Crisp packet", + "Spread tub", + "Tupperware", + "Disposable food container", + "Foam food container", + "Other plastic container", + "Plastic glooves", + "Plastic utensils", + "Pop tab", + "Rope & strings", + "Scrap metal", + "Shoe", + "Squeezable tube", + "Plastic straw", + "Paper straw", + "Styrofoam piece", + "Unlabeled litter", + "Cigarette", + ), + # "label_map": {i: i for i in range(1, 61)}, + } +) # ----------------------- TRANSFORMS ----------------------- # From 5db6856afc2c5e3e15a357604eec0a3eda5cabb4 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 17:28:34 +0700 Subject: [PATCH 07/10] AMP Working but multibox_loss broken --- backbone.py | 5 +++-- data/config.py | 2 ++ external/DCNv2/dcn_v2.py | 41 ++++++++++++++++++++++++++++++++++++++++ train.py | 16 ++++++++++++++-- yolact.py | 15 ++++++++++----- 5 files changed, 70 insertions(+), 9 deletions(-) diff --git a/backbone.py b/backbone.py index 254168008..4b5154b03 100644 --- a/backbone.py +++ b/backbone.py @@ -5,7 +5,7 @@ from collections import OrderedDict try: - from dcn_v2 import DCN + from dcn_v2 import DCN, set_amp except ImportError: def DCN(*args, **kwdargs): @@ -541,8 +541,9 @@ def add_layer(self, conv_channels=128, downsample=2): self.layers.append(layer) -def construct_backbone(cfg): +def construct_backbone(cfg, use_amp): """ Constructs a backbone given a backbone config object (see config.py). """ + set_amp(use_amp) backbone = cfg.type(*cfg.args) # Add downsampling layers until we reach the number we need diff --git a/data/config.py b/data/config.py index d754e8768..88a1c1141 100644 --- a/data/config.py +++ b/data/config.py @@ -841,6 +841,8 @@ def print(self): "rescore_mask": False, "rescore_bbox": False, "maskious_to_train": -1, + # Use 16-bit precision + "use_amp": True, } ) diff --git a/external/DCNv2/dcn_v2.py b/external/DCNv2/dcn_v2.py index 885c8898f..6f064c621 100644 --- a/external/DCNv2/dcn_v2.py +++ b/external/DCNv2/dcn_v2.py @@ -12,6 +12,13 @@ import _ext as _backend +use_amp = False + + +def set_amp(amp): + global use_amp + use_amp = amp + class _DCNv2(Function): @staticmethod @@ -27,6 +34,13 @@ def forward( dilation, deformable_groups, ): + if use_amp: + input = input.float() + offset = offset.float() + mask = mask.float() + weight = weight.float() + bias = bias.float() + ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) @@ -49,11 +63,17 @@ def forward( ctx.deformable_groups, ) ctx.save_for_backward(input, offset, mask, weight, bias) + + if use_amp: + return output.half() return output @staticmethod @once_differentiable def backward(ctx, grad_output): + if use_amp: + grad_output = grad_output.float() + input, offset, mask, weight, bias = ctx.saved_tensors ( grad_input, @@ -79,6 +99,13 @@ def backward(ctx, grad_output): ctx.deformable_groups, ) + if use_amp: + grad_input = grad_input.half() + grad_offset = grad_offset.half() + grad_mask = grad_mask.half() + grad_weight = grad_weight.half() + grad_bias = grad_bias.half() + return ( grad_input, grad_offset, @@ -223,6 +250,11 @@ def forward( sample_per_part=4, trans_std=0.0, ): + if use_amp: + input = input.float() + roi = roi.float() + offset = offset.float() + ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) ctx.output_dim = output_dim @@ -246,11 +278,17 @@ def forward( ctx.trans_std, ) ctx.save_for_backward(input, rois, offset, output_count) + + if use_amp: + return output.half() return output @staticmethod @once_differentiable def backward(ctx, grad_output): + if use_amp: + grad_output = grad_output.float() + input, rois, offset, output_count = ctx.saved_tensors grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( grad_output, @@ -268,6 +306,9 @@ def backward(ctx, grad_output): ctx.trans_std, ) + if use_amp: + grad_input = grad_input.half() + grad_offset = grad_offset.half() return ( grad_input, None, diff --git a/train.py b/train.py index 8a2e82a82..4b3ab1220 100644 --- a/train.py +++ b/train.py @@ -330,9 +330,16 @@ def train(): ) exit(-1) - net = CustomDataParallel(NetLoss(net, criterion)) if args.cuda: net = net.cuda() + if cfg.use_amp: + from apex import amp + + if not args.cuda: + raise ValueError("amp must be used with CUDA") + net, optimizer = amp.initialize(net, optimizer, opt_level="O1") + + net = CustomDataParallel(NetLoss(net, criterion)) # Initialize everything if not cfg.freeze_bn: @@ -437,7 +444,12 @@ def train(): # all_loss = sum([v.mean() for v in losses.values()]) # Backprop - loss.backward() # Do this to free up vram even if loss is not finite + if cfg.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() # Do this to free up vram even if loss is not finite + if torch.isfinite(loss).item(): optimizer.step() diff --git a/yolact.py b/yolact.py index fa6d73708..3a2302c43 100644 --- a/yolact.py +++ b/yolact.py @@ -26,6 +26,11 @@ if not use_jit: print("Multiple GPUs detected! Turning off JIT.") +# AMP does not support TorchScript https://github.com/NVIDIA/apex/issues/303 +elif cfg.use_amp: + use_jit = False + print("Using AMP, JIT disabled") + ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn @@ -505,10 +510,10 @@ class Yolact(nn.Module): ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ - ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ - ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ - ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ - ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ + ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ + ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ + ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ + ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ You can set the arguments by changing them in the backbone config object in config.py. @@ -522,7 +527,7 @@ class Yolact(nn.Module): def __init__(self): super().__init__() - self.backbone = construct_backbone(cfg.backbone) + self.backbone = construct_backbone(cfg.backbone, cfg.use_amp) if cfg.freeze_bn: self.freeze_bn() From 5e1be29653f90d55ff982cfe60d854a62411dea4 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 17:31:47 +0700 Subject: [PATCH 08/10] Everything AMP Working --- layers/modules/multibox_loss.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/layers/modules/multibox_loss.py b/layers/modules/multibox_loss.py index e7e8ea88b..359e66823 100644 --- a/layers/modules/multibox_loss.py +++ b/layers/modules/multibox_loss.py @@ -75,7 +75,7 @@ def forward(self, net, predictions, targets, masks, num_crowds): num_crowds (list): Number of crowd annotations per batch. The crowd annotations should be the last num_crowds elements of targets and masks. - + * Only if mask_type == lincomb """ @@ -288,7 +288,10 @@ def semantic_segmentation_loss( mode=interpolation_mode, align_corners=False, ).squeeze(0) - downsampled_masks = downsampled_masks.gt(0.5).float() + if cfg.use_amp: + downsampled_masks = downsampled_masks.gt(0.5).half() + else: + downsampled_masks = downsampled_masks.gt(0.5).float() # Construct Semantic Segmentation segment_t = torch.zeros_like(cur_segment, requires_grad=False) From 9dd42330d4cddcf8885f53e087052fa16be542a0 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 17:49:12 +0700 Subject: [PATCH 09/10] Updated README --- README.md | 15 ++++++---- data/coco.py | 2 +- data/config.py | 76 +------------------------------------------------- 3 files changed, 12 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 641eac6e2..fdd987dec 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ ``` ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ - ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ - ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ - ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ - ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ + ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ + ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ + ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ + ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ``` A simple, fully convolutional model for real-time instance segmentation. This is the code for our papers: @@ -44,7 +44,7 @@ Some examples from our YOLACT base model (33.5 fps on a Titan Xp and 29.8 mAP on ```Shell # Cython needs to be installed before pycocotools pip install cython - pip install opencv-python pillow pycocotools matplotlib + pip install opencv-python pillow pycocotools matplotlib ``` - If you'd like to train YOLACT, download the COCO dataset and the 2014/2017 annotations. Note that this script will take a while and dump 21gb of files into `./data/coco`. ```Shell @@ -169,6 +169,11 @@ YOLACT now supports multiple GPUs seamlessly during training: - If you have memory to spare you can increase the batch size further, but keep it a multiple of the number of GPUs you're using. - If you want to allocate the images per GPU specific for different GPUs, you can use `--batch_alloc=[alloc]` where [alloc] is a comma seprated list containing the number of images on each GPU. This must sum to `batch_size`. +## 16-bit Precision Support and Dynamic Loss-scaling +YOLACT now supports the use of [NVidia's Apex AMP](https://github.com/NVIDIA/apex), enabling computation in FP16, while maintaining the weights in FP32. The use of dynamic loss scaling also prevents the obnoxious `Moving average ignored a value of inf/nan` error. The only drawback is that [Apex does not support `torch.jit`](https://github.com/NVIDIA/apex/issues/308) + +To enable Apex AMP support, set `use_amp` in `data/config.py` to `True`. + ## Logging YOLACT now logs training and validation information by default. You can disable this with `--no_log`. A guide on how to visualize these logs is coming soon, but now you can look at `LogVizualizer` in `utils/logger.py` for help. diff --git a/data/coco.py b/data/coco.py index c8ffae63d..d0375b147 100644 --- a/data/coco.py +++ b/data/coco.py @@ -42,7 +42,7 @@ def __call__(self, target, width, height): bbox = obj["bbox"] label_idx = obj["category_id"] if label_idx >= 0: - label_idx = self.label_map[label_idx + 1] - 1 + label_idx = self.label_map[label_idx] - 1 final_box = list( np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) / scale diff --git a/data/config.py b/data/config.py index 88a1c1141..8bbcd8a1b 100644 --- a/data/config.py +++ b/data/config.py @@ -325,80 +325,6 @@ def print(self): } ) -taco_dataset = dataset_base.copy( - { - "name": "TACO", - "train_images": "/home/rtx/arm/TACO/data", - "train_info": "/home/rtx/arm/TACO/data/annotations_0_train.json", - "valid_images": "/home/rtx/arm/TACO/data", - "valid_info": "/home/rtx/arm/TACO/data/annotations_0_val.json", - "has_gt": True, - "class_names": ( - "Aluminium foil", - "Battery", - "Aluminium blister pack", - "Carded blister pack", - "Other plastic bottle", - "Clear plastic bottle", - "Glass bottle", - "Plastic bottle cap", - "Metal bottle cap", - "Broken glass", - "Food Can", - "Aerosol", - "Drink can", - "Toilet tube", - "Other carton", - "Egg carton", - "Drink carton", - "Corrugated carton", - "Meal carton", - "Pizza box", - "Paper cup", - "Disposable plastic cup", - "Foam cup", - "Glass cup", - "Other plastic cup", - "Food waste", - "Glass jar", - "Plastic lid", - "Metal lid", - "Other plastic", - "Magazine paper", - "Tissues", - "Wrapping paper", - "Normal paper", - "Paper bag", - "Plastified paper bag", - "Plastic film", - "Six pack rings", - "Garbage bag", - "Other plastic wrapper", - "Single-use carrier bag", - "Polypropylene bag", - "Crisp packet", - "Spread tub", - "Tupperware", - "Disposable food container", - "Foam food container", - "Other plastic container", - "Plastic glooves", - "Plastic utensils", - "Pop tab", - "Rope & strings", - "Scrap metal", - "Shoe", - "Squeezable tube", - "Plastic straw", - "Paper straw", - "Styrofoam piece", - "Unlabeled litter", - "Cigarette", - ), - # "label_map": {i: i for i in range(1, 61)}, - } -) - # ----------------------- TRANSFORMS ----------------------- # resnet_transform = Config( @@ -842,7 +768,7 @@ def print(self): "rescore_bbox": False, "maskious_to_train": -1, # Use 16-bit precision - "use_amp": True, + "use_amp": False, } ) From e1a949445dc0c57eb7c8f10470630faff0ce22e2 Mon Sep 17 00:00:00 2001 From: jasonkena Date: Thu, 27 Feb 2020 17:51:34 +0700 Subject: [PATCH 10/10] Fixed README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fdd987dec..bae11c7b3 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ YOLACT now supports multiple GPUs seamlessly during training: - If you want to allocate the images per GPU specific for different GPUs, you can use `--batch_alloc=[alloc]` where [alloc] is a comma seprated list containing the number of images on each GPU. This must sum to `batch_size`. ## 16-bit Precision Support and Dynamic Loss-scaling -YOLACT now supports the use of [NVidia's Apex AMP](https://github.com/NVIDIA/apex), enabling computation in FP16, while maintaining the weights in FP32. The use of dynamic loss scaling also prevents the obnoxious `Moving average ignored a value of inf/nan` error. The only drawback is that [Apex does not support `torch.jit`](https://github.com/NVIDIA/apex/issues/308) +YOLACT now supports the use of [NVidia's Apex AMP](https://github.com/NVIDIA/apex), enabling computation in FP16, while maintaining the weights in FP32. The use of dynamic loss scaling also prevents the `Moving average ignored a value of inf/nan` error. The only drawback is that [Apex does not support `torch.jit`](https://github.com/NVIDIA/apex/issues/303). To enable Apex AMP support, set `use_amp` in `data/config.py` to `True`.