Temporal CNN classifier model

Models
1D-convolutional models $f$ are functions $f = f_1 \circ f_2 \circ \ldots \circ f_n$ made of $n$ layers:

$$f_i(x) = \pi\big(\sigma(W_i * x + b_i)\big), \quad \forall i \in [\![1, n]\]$$

where $\sigma$ is an activation function (ReLU, PReLU, Leaky ReLU, tanh, ...); $\pi$ is an optional pooling function, such as average-pooling or max-pooling; and $W_i$ and $b_i$ are the weights and bias of the $i$-th layer. As in the models below, the pooling $\pi$ is applied after the activation $\sigma$.
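For illustration, one such layer $f_i$ maps directly onto a Conv1d / activation / pooling triple in PyTorch. A minimal sketch (the channel counts and kernel size here are arbitrary placeholders, not taken from the models below):

```python
import torch

# One layer f_i(x) = pi(sigma(W_i * x + b_i))
layer_i = torch.nn.Sequential(
    torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),  # W_i * x + b_i
    torch.nn.ReLU(),        # sigma: activation
    torch.nn.AvgPool1d(2),  # pi: optional pooling (halves the temporal length)
)

x = torch.randn(32, 8, 100)  # (batch, feature_maps, duration)
print(layer_i(x).shape)      # torch.Size([32, 4, 50])
```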
1D-CNN over time with C independent channels
Input shape: (batch, duration=100, channels=66)
Output shape: (batch, n_classes=14)
```python
import itertools

import torch


class IndependentChannelsNet(torch.nn.Module):

    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(IndependentChannelsNet, self).__init__()

        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability

        # Layers ----------------------------------------------
        # "High"-scale temporal branch (wide kernels, size 7), one branch per channel
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        # "Low"-scale temporal branch (narrow kernels, size 3), one branch per channel
        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        # Residual branch: plain average-pooling, so its output length matches the conv branches
        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.fc = torch.nn.Sequential(
            # 9 feature maps per channel: 4 (high) + 4 (low) + 1 (residual)
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 12: depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately
        all_features = []

        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1d format: (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)

            high = self.all_conv_high[channel](input_channel)
            low = self.all_conv_low[channel](input_channel)
            ap_residual = self.all_residual[channel](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)

        # Flatten for the Linear layers
        # <-- 12: depends on the initial sequence length (100).
        # If you have shorter/longer sequences, you probably do not even need to modify the
        # network architecture: resampling your input gesture from T timesteps to 100 timesteps
        # will (perhaps surprisingly) probably work just as well!
        all_features = all_features.view(-1, 9 * self.n_channels * 12)

        # Fully-connected layers
        output = self.fc(all_features)

        return output
```
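A quick shape check of this model (a minimal sketch; the batch size of 32 is arbitrary):

```python
model = IndependentChannelsNet(n_channels=66, n_classes=14)

x = torch.randn(32, 100, 66)  # (batch, duration, channels)
logits = model(x)
print(logits.shape)           # torch.Size([32, 14])
```

The flattened size works out to 9 feature maps x 66 channels x 12 timesteps = 7128, where 12 = floor(100 / 2 / 2 / 2) after the three pooling stages.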
1D-CNN over time with C=3 shared channels: for XYZ
Note: This model assumes that channels are in block mode: the channel dimension should contain all Xs first, then all Ys, then all Zs (a reordering sketch is given after the code below).
Input shape: (batch, duration=100, channels=66)
Output shape: (batch, n_classes=14)
```python
class XYZSharedChannelNet(torch.nn.Module):

    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(XYZSharedChannelNet, self).__init__()

        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability

        # Layers ----------------------------------------------
        # Only three branch sets (one per coordinate X, Y, Z), shared across joints
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])

        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])

        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 12: depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately
        all_features = []

        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Pick the shared branch matching this channel's coordinate block
            # (channels are in block mode: all Xs, then all Ys, then all Zs)
            if channel < 1 * self.n_channels // 3:
                idx = 0  # X block
            elif channel < 2 * self.n_channels // 3:
                idx = 1  # Y block
            else:
                idx = 2  # Z block

            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1d format: (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)

            high = self.all_conv_high[idx](input_channel)
            low = self.all_conv_low[idx](input_channel)
            ap_residual = self.all_residual[idx](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)

        # Flatten for the Linear layers
        # <-- 12: depends on the initial sequence length (100).
        # If you have shorter/longer sequences, you probably do not even need to modify the
        # network architecture: resampling your input gesture from T timesteps to 100 timesteps
        # will (perhaps surprisingly) probably work just as well!
        all_features = all_features.view(-1, 9 * self.n_channels * 12)

        # Fully-connected layers
        output = self.fc(all_features)

        return output
```
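If your recordings instead interleave coordinates per joint (x0, y0, z0, x1, y1, z1, ...), a single permutation of the channel axis brings them into the block mode this model expects. A minimal sketch, assuming 22 joints x 3 coordinates = 66 channels:

```python
x_interleaved = torch.randn(32, 100, 66)  # channels ordered (x0, y0, z0, x1, y1, z1, ...)

# Permutation grouping all Xs first, then all Ys, then all Zs
n_joints = 22
order = [3 * joint + coord for coord in range(3) for joint in range(n_joints)]
x_block = x_interleaved[:, :, order]      # channels ordered (x0..x21, y0..y21, z0..z21)
```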
1D-CNN over time with C=1 shared channel: for all
Input shape: (batch, duration=100, channels=66)
Output shape: (batch, n_classes=14)
```python
class OneSharedChannelNet(torch.nn.Module):

    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(OneSharedChannelNet, self).__init__()

        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability

        # Layers ----------------------------------------------
        # A single branch set, shared by all channels
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])

        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])

        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 12: depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately, always through the single shared branch
        all_features = []

        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1d format: (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)

            high = self.all_conv_high[0](input_channel)
            low = self.all_conv_low[0](input_channel)
            ap_residual = self.all_residual[0](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)

        # Flatten for the Linear layers
        # <-- 12: depends on the initial sequence length (100).
        # If you have shorter/longer sequences, you probably do not even need to modify the
        # network architecture: resampling your input gesture from T timesteps to 100 timesteps
        # will (perhaps surprisingly) probably work just as well!
        all_features = all_features.view(-1, 9 * self.n_channels * 12)

        # Fully-connected layers
        output = self.fc(all_features)

        return output
```
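The three variants accept the same input and only differ in how many convolutional branch copies they instantiate (66, 3, or 1), so comparing their parameter counts makes the weight sharing concrete. A quick sketch:

```python
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

for cls in (IndependentChannelsNet, XYZSharedChannelNet, OneSharedChannelNet):
    print(cls.__name__, count_parameters(cls(n_channels=66, n_classes=14)))
```

The fully-connected part (identical in all three models) dominates the total, so the savings from sharing show up mostly in the convolutional branches.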