Temporal CNN classifier model

Models

1D-convolutional models are functions f = f_n \circ \ldots \circ f_2 \circ f_1 made of n layers (layer f_1 is applied first):

f_i(x) = \pi ( \sigma ( W_i * x + b_i )) , \quad \forall i \in [[1, n]]

where \sigma is an activation function such as ReLU, PReLU, Leaky ReLU or tanh; \pi is an optional pooling function such as average pooling or max pooling; * denotes the 1D convolution; and W_i and b_i are the weights and bias of the i-th layer.
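
For instance, a single layer f_i can be written in PyTorch as follows (a minimal sketch: the kernel size, channel counts and pooling choice are arbitrary placeholders, not the values used by the models below):

import torch

# One layer f_i(x) = pi(sigma(W_i * x + b_i)):
# Conv1d computes W_i * x + b_i, ReLU is the activation sigma,
# and AvgPool1d is the optional pooling pi.
layer = torch.nn.Sequential(
    torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
    torch.nn.ReLU(),
    torch.nn.AvgPool1d(2),
)

x = torch.randn(32, 1, 100)  # (batch, feature maps, duration)
print(layer(x).shape)        # torch.Size([32, 8, 50])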

1D-CNN over time with C independent channels

Input shape: (batch, duration=100, channels=66)

Output shape: (batch, n_classes=14)

import itertools

import torch


class IndependentChannelsNet(torch.nn.Module):
    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(IndependentChannelsNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability
        # Layers ----------------------------------------------
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])
        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])
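        # Residual branch: no convolutions, only average pooling, so a
        # downsampled copy of the raw input channel is concatenated
        # alongside the learned features in forward().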
        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 9 = 4 + 4 + 1 feature maps per channel; 12 depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )
        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately
        all_features = []
        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]
            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)
            high = self.all_conv_high[channel](input_channel)
            low = self.all_conv_low[channel](input_channel)
            ap_residual = self.all_residual[channel](input_channel)
            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)
        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)
        # Flatten for the Linear layers
        all_features = all_features.view(-1, 9 * self.n_channels * 12)  # <-- 12 comes from the initial sequence length: the three AvgPool1d(2) layers halve 100 to 50, 25, then 12.
        # If you have shorter/longer sequences, you probably do NOT even need to modify the network architecture:
        # resampling your input gesture from T timesteps to 100 timesteps will (perhaps surprisingly) work just as well!
        # Fully-Connected Layers
        output = self.fc(all_features)
        return output
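
A quick smoke test on random data (a minimal sketch; the batch size of 32 is arbitrary, and note that the network outputs raw logits, e.g. for torch.nn.CrossEntropyLoss):

test_model = IndependentChannelsNet()
x = torch.randn(32, 100, 66)  # (batch, duration, channels)
logits = test_model(x)
print(logits.shape)           # torch.Size([32, 14])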

1D-CNN over time with C=3 shared channels: one per X/Y/Z coordinate

Note: This model assumes that the channels are in block mode: the channel dimension must contain all the Xs first, then all the Ys, then all the Zs (with 66 channels, that is x_1 ... x_22, y_1 ... y_22, z_1 ... z_22 for 22 joints).
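
If your data instead stores the coordinates interleaved per joint (x_1, y_1, z_1, x_2, y_2, z_2, ...), a view/permute pair can rearrange it into block mode. A sketch, assuming 22 joints x 3 coordinates = 66 channels and a hypothetical tensor x_interleaved:

# x_interleaved: (batch, duration, 66), channels ordered x1, y1, z1, x2, y2, z2, ...
batch, duration, _ = x_interleaved.shape
x_block = (x_interleaved
           .view(batch, duration, 22, 3)   # split the channel axis into (joint, coordinate)
           .permute(0, 1, 3, 2)            # -> (batch, duration, coordinate, joint)
           .reshape(batch, duration, 66))  # all Xs, then all Ys, then all Zs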

Input shape: (batch, duration=100, channels=66)

Output shape: (batch, n_classes=14)

class XYZSharedChannelNet(torch.nn.Module):
    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(XYZSharedChannelNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability
        # Layers ----------------------------------------------
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])
        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])
        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(3)])
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 9 = 4 + 4 + 1 feature maps per channel; 12 depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )
        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately
        all_features = []
        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Select the coordinate branch: in block mode, the first third of
            # the channels are Xs, the second third Ys, the last third Zs.
            if channel < self.n_channels // 3:
                idx = 0
            elif channel < 2 * self.n_channels // 3:
                idx = 1
            else:
                idx = 2
            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)
            high = self.all_conv_high[idx](input_channel)
            low = self.all_conv_low[idx](input_channel)
            ap_residual = self.all_residual[idx](input_channel)
            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)
        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)
        # Flatten for the Linear layers
        all_features = all_features.view(-1, 9 * self.n_channels * 12)  # <-- 12 comes from the initial sequence length: the three AvgPool1d(2) layers halve 100 to 50, 25, then 12.
        # If you have shorter/longer sequences, you probably do NOT even need to modify the network architecture:
        # resampling your input gesture from T timesteps to 100 timesteps will (perhaps surprisingly) work just as well!
        # Fully-Connected Layers
        output = self.fc(all_features)
        return output
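
Because the three coordinate branches are shared across all joints, this variant instantiates 3 convolutional branches instead of 66; the fully-connected layers are unchanged. A quick smoke test (same assumptions as above):

test_model = XYZSharedChannelNet()
x = torch.randn(32, 100, 66)  # block mode: all Xs, then all Ys, then all Zs
logits = test_model(x)
print(logits.shape)           # torch.Size([32, 14])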

1D-CNN over time with C=1 shared channel: a single branch for all channels

Input shape: (batch, duration=100, channels=66)

Output shape: (batch, n_classes=14)

class OneSharedChannelNet(torch.nn.Module):
    def __init__(self, n_channels=66, n_classes=14, fc_hidden_size=1936, dropout_probability=0.2):
        super(OneSharedChannelNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.fc_hidden_size = fc_hidden_size
        self.dropout_probability = dropout_probability
        # Layers ----------------------------------------------
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])
        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])
        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(1)])
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=self.fc_hidden_size),  # <-- 9 = 4 + 4 + 1 feature maps per channel; 12 depends on the sequence length (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=self.fc_hidden_size, out_features=n_classes)
        )
        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        # Work on each channel separately
        all_features = []
        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]
            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)
            high = self.all_conv_high[0](input_channel)
            low = self.all_conv_low[0](input_channel)
            ap_residual = self.all_residual[0](input_channel)
            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([high, low, ap_residual], dim=1)
            all_features.append(output_channel)
        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)
        # Flatten for the Linear layers
        all_features = all_features.view(-1, 9 * self.n_channels * 12)  # <-- 12 comes from the initial sequence length: the three AvgPool1d(2) layers halve 100 to 50, 25, then 12.
        # If you have shorter/longer sequences, you probably do NOT even need to modify the network architecture:
        # resampling your input gesture from T timesteps to 100 timesteps will (perhaps surprisingly) work just as well!
        # Fully-Connected Layers
        output = self.fc(all_features)
        return output
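
The three variants differ only in how many convolutional branches they build (66, 3 or 1); the fully-connected part, which dominates the parameter count, is identical. A quick way to compare them (a sketch; count_parameters is a hypothetical helper defined here, not part of the models):

def count_parameters(model):
    # Count trainable parameters only
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

for net in (IndependentChannelsNet(), XYZSharedChannelNet(), OneSharedChannelNet()):
    print(f"{net.__class__.__name__}: {count_parameters(net):,} parameters")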