Unverified Commit b41893c1 authored by NingMa, committed by GitHub

initial code submission

parent 61d30a2e
[environment]
pytorch=1.7
python=3.8.10
tqdm=4.54.1
pynvml=8.0.4
[dataset]
Because the dataset exceeds the maximum upload limit of 100M, please download the public NTU RGB+D 120 dataset, or use our subset once the work is accepted.
[run]
# Run train.py with default parameters (DAST w/ RankMax, 5-way-1-shot, STGCN, on NTU RGB+D 120).
python train.py --SA 0 --reg 0.1
# Run train.py with spatial activation (DAST w/ SA, 5-way-1-shot, STGCN, on NTU RGB+D 120).
python train.py --SA 1 --reg 0
# Run train.py with the full model (DAST (full), 5-way-1-shot, STGCN, on NTU RGB+D 120).
python train.py --SA 1 --reg 0.1
import torch
from torch import nn
import math
class LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(LayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
class CrossAttention(nn.Module):
def __init__(self, num_attention_heads, input_size, hidden_size, hidden_dropout_prob):
super(CrossAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = hidden_size
self.query = nn.Linear(input_size, self.all_head_size)
self.key = nn.Linear(input_size, self.all_head_size)
self.value = nn.Linear(input_size, self.all_head_size)
attention_probs_dropout_prob = 0.2
self.attn_dropout = nn.Dropout(attention_probs_dropout_prob)
        # After self-attention, apply a feed-forward dense layer, dropout, and LayerNorm to produce the output
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = LayerNorm(hidden_size, eps=1e-12)
self.out_dropout = nn.Dropout(hidden_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, x, y):
mixed_query_layer = self.query(x)
mixed_key_layer = self.key(y)
mixed_value_layer = self.value(x)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
# [batch_size heads seq_len seq_len] scores
# [batch_size 1 1 seq_len]
# attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
# Fixme
attention_probs = self.attn_dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
hidden_states = self.dense(context_layer)
hidden_states = self.out_dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + x) # residual
return hidden_states
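# Illustrative sanity check (a sketch, not part of the original pipeline):
# the TF-style LayerNorm matches torch.nn.LayerNorm, and CrossAttention
# returns a tensor shaped like its first argument thanks to the residual
# connection. All sizes below are arbitrary.
if __name__ == '__main__':
    x = torch.randn(4, 25, 256)
    y = torch.randn(4, 25, 256)
    assert torch.allclose(LayerNorm(256)(x),
                          nn.LayerNorm(256, eps=1e-12)(x), atol=1e-5)
    attn = CrossAttention(num_attention_heads=1, input_size=256,
                          hidden_size=256, hidden_dropout_prob=0.2)
    attn.eval()  # disable dropout for a deterministic check
    print(attn(x, y).shape)  # torch.Size([4, 25, 256])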
import numpy as np
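# Mutable global state shared across modules. R_ and D_ are debug buffers for
# the most recent soft-DTW alignment and distance matrices (see soft_dtw.py);
# the random arrays below are only placeholders.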
epoch=0
device='cuda:5'
experiment_root='../output'
debug=False
local_match=0
reg_rate=0
threshold=3
gamma=0.1
iter=0
R_=np.random.randn(250, 15, 15)
D_=np.random.randn(250, 15, 15)
mod='train'
backbone='st_gcn'
dataset='ntu120'
SA=0
from . import tools
from . import ntu_rgb_d
from . import kinetics
import sys
sys.path.insert(0, '')
sys.path.extend(['../'])
import numpy as np
from graph import tools
# Joint index:
# {0, "Nose"}
# {1, "Neck"},
# {2, "RShoulder"},
# {3, "RElbow"},
# {4, "RWrist"},
# {5, "LShoulder"},
# {6, "LElbow"},
# {7, "LWrist"},
# {8, "RHip"},
# {9, "RKnee"},
# {10, "RAnkle"},
# {11, "LHip"},
# {12, "LKnee"},
# {13, "LAnkle"},
# {14, "REye"},
# {15, "LEye"},
# {16, "REar"},
# {17, "LEar"},
num_node = 18
self_link = [(i, i) for i in range(num_node)]
inward = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), (10, 9), (9, 8),
(11, 5), (8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0), (17, 15),
(16, 14)]
outward = [(j, i) for (i, j) in inward]
neighbor = inward + outward
class AdjMatrixGraph:
def __init__(self, *args, **kwargs):
self.num_nodes = num_node
self.edges = neighbor
self.self_loops = [(i, i) for i in range(self.num_nodes)]
self.A_binary = tools.get_adjacency_matrix(self.edges, self.num_nodes)
self.A_binary_with_I = tools.get_adjacency_matrix(self.edges + self.self_loops, self.num_nodes)
if __name__ == '__main__':
graph = AdjMatrixGraph()
A_binary = graph.A_binary
import matplotlib.pyplot as plt
print(A_binary)
plt.matshow(A_binary)
plt.show()
import sys
sys.path.insert(0, '')
sys.path.extend(['../'])
import numpy as np
from graph import tools
num_node = 25
self_link = [(i, i) for i in range(num_node)]
inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), (7, 6),
(8, 7), (9, 21), (10, 9), (11, 10), (12, 11), (13, 1),
(14, 13), (15, 14), (16, 15), (17, 1), (18, 17), (19, 18),
(20, 19), (22, 23), (23, 8), (24, 25), (25, 12)]
inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]
outward = [(j, i) for (i, j) in inward]
neighbor = inward + outward
class AdjMatrixGraph:
def __init__(self, *args, **kwargs):
self.edges = neighbor
self.num_nodes = num_node
self.self_loops = [(i, i) for i in range(self.num_nodes)]
self.A_binary = tools.get_adjacency_matrix(self.edges, self.num_nodes)
self.A_binary_with_I = tools.get_adjacency_matrix(self.edges + self.self_loops, self.num_nodes)
self.A = tools.normalize_adjacency_matrix(self.A_binary)
if __name__ == '__main__':
import matplotlib.pyplot as plt
graph = AdjMatrixGraph()
A, A_binary, A_binary_with_I = graph.A, graph.A_binary, graph.A_binary_with_I
f, ax = plt.subplots(1, 3)
ax[0].imshow(A_binary_with_I, cmap='gray')
ax[1].imshow(A_binary, cmap='gray')
ax[2].imshow(A, cmap='gray')
plt.show()
print(A_binary_with_I.shape, A_binary.shape, A.shape)
import numpy as np
def edge2mat(link, num_node):
A = np.zeros((num_node, num_node))
for i, j in link:
A[j, i] = 1
return A
def normalize_digraph(A):
Dl = np.sum(A, 0)
h, w = A.shape
Dn = np.zeros((w, w))
for i in range(w):
if Dl[i] > 0:
Dn[i, i] = Dl[i] ** (-1)
AD = np.dot(A, Dn)
return AD
def get_spatial_graph(num_node, self_link, inward, outward):
I = edge2mat(self_link, num_node)
In = normalize_digraph(edge2mat(inward, num_node))
Out = normalize_digraph(edge2mat(outward, num_node))
A = np.stack((I, In, Out))
return A
def k_adjacency(A, k, with_self=False, self_factor=1):
assert isinstance(A, np.ndarray)
I = np.eye(len(A), dtype=A.dtype)
if k == 0:
return I
Ak = np.minimum(np.linalg.matrix_power(A + I, k), 1) \
- np.minimum(np.linalg.matrix_power(A + I, k - 1), 1)
if with_self:
Ak += (self_factor * I)
return Ak
def normalize_adjacency_matrix(A):
node_degrees = A.sum(-1)
degs_inv_sqrt = np.power(node_degrees, -0.5)
norm_degs_matrix = np.eye(len(node_degrees)) * degs_inv_sqrt
return (norm_degs_matrix @ A @ norm_degs_matrix).astype(np.float32)
def get_adjacency_matrix(edges, num_nodes):
A = np.zeros((num_nodes, num_nodes), dtype=np.float32)
for edge in edges:
A[edge] = 1.
return A
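# Illustrative usage (a sketch): build the three ST-GCN partitions for a toy
# 4-node chain graph. The node count and edges below are synthetic.
if __name__ == '__main__':
    num_node = 4
    self_link = [(i, i) for i in range(num_node)]
    inward = [(1, 0), (2, 1), (3, 2)]
    outward = [(j, i) for (i, j) in inward]
    A = get_spatial_graph(num_node, self_link, inward, outward)
    print(A.shape)  # (3, 4, 4): identity, inward and outward partitions
    # k_adjacency keeps only the nodes reachable in exactly k hops
    print(k_adjacency(get_adjacency_matrix(inward + outward, num_node), 2))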
from .st_gcn_aaai18 import ST_GCN_18
# from .hrnet import HRNet
# # ------------------------------------------------------------------------------
# # Copyright (c) Microsoft
# # Licensed under the MIT License.
# # Written by Bin Xiao (Bin.Xiao@microsoft.com)
# # ------------------------------------------------------------------------------
# import logging
#
# import torch.nn as nn
# from mmcv.cnn import constant_init, kaiming_init
# from mmcv.runner import load_checkpoint
# from torch.nn.modules.batchnorm import _BatchNorm
#
# BN_MOMENTUM = 0.1
#
# def conv3x3(in_planes, out_planes, stride=1):
# """3x3 convolution with padding"""
# return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
# padding=1, bias=False)
#
#
# class BasicBlock(nn.Module):
# expansion = 1
#
# def __init__(self, inplanes, planes, stride=1, downsample=None):
# super(BasicBlock, self).__init__()
# self.conv1 = conv3x3(inplanes, planes, stride)
# self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
# self.relu = nn.ReLU(inplace=True)
# self.conv2 = conv3x3(planes, planes)
# self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
# self.downsample = downsample
# self.stride = stride
#
# def forward(self, x):
# residual = x
#
# out = self.conv1(x)
# out = self.bn1(out)
# out = self.relu(out)
#
# out = self.conv2(out)
# out = self.bn2(out)
#
# if self.downsample is not None:
# residual = self.downsample(x)
#
# out += residual
# out = self.relu(out)
#
# return out
#
#
# class Bottleneck(nn.Module):
# expansion = 4
#
# def __init__(self, inplanes, planes, stride=1, downsample=None):
# super(Bottleneck, self).__init__()
# self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
# self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
# self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
# padding=1, bias=False)
# self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
# self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
# bias=False)
# self.bn3 = nn.BatchNorm2d(planes * self.expansion,
# momentum=BN_MOMENTUM)
# self.relu = nn.ReLU(inplace=True)
# self.downsample = downsample
# self.stride = stride
#
# def forward(self, x):
# residual = x
#
# out = self.conv1(x)
# out = self.bn1(out)
# out = self.relu(out)
#
# out = self.conv2(out)
# out = self.bn2(out)
# out = self.relu(out)
#
# out = self.conv3(out)
# out = self.bn3(out)
#
# if self.downsample is not None:
# residual = self.downsample(x)
#
# out += residual
# out = self.relu(out)
#
# return out
#
# class HRModule(nn.Module):
# def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
# num_channels, fuse_method, multi_scale_output=True):
# super(HRModule, self).__init__()
# self._check_branches(
# num_branches, num_blocks, num_inchannels, num_channels)
#
# self.num_inchannels = num_inchannels
# self.fuse_method = fuse_method
# self.num_branches = num_branches
#
# self.multi_scale_output = multi_scale_output
#
# self.branches = self._make_branches(
# num_branches, blocks, num_blocks, num_channels)
# self.fuse_layers = self._make_fuse_layers()
# self.relu = nn.ReLU(True)
#
# def _check_branches(self, num_branches, num_blocks,
# num_inchannels, num_channels):
# if num_branches != len(num_blocks):
# error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
# num_branches, len(num_blocks))
# raise ValueError(error_msg)
#
# if num_branches != len(num_channels):
# error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
# num_branches, len(num_channels))
# raise ValueError(error_msg)
#
# if num_branches != len(num_inchannels):
# error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
# num_branches, len(num_inchannels))
# raise ValueError(error_msg)
#
# def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
# stride=1):
# downsample = None
# if stride != 1 or \
# self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
# downsample = nn.Sequential(
# nn.Conv2d(
# self.num_inchannels[branch_index],
# num_channels[branch_index] * block.expansion,
# kernel_size=1, stride=stride, bias=False
# ),
# nn.BatchNorm2d(
# num_channels[branch_index] * block.expansion,
# momentum=BN_MOMENTUM
# ),
# )
#
# layers = []
# layers.append(
# block(
# self.num_inchannels[branch_index],
# num_channels[branch_index],
# stride,
# downsample
# )
# )
# self.num_inchannels[branch_index] = \
# num_channels[branch_index] * block.expansion
# for i in range(1, num_blocks[branch_index]):
# layers.append(
# block(
# self.num_inchannels[branch_index],
# num_channels[branch_index]
# )
# )
#
# return nn.Sequential(*layers)
#
# def _make_branches(self, num_branches, block, num_blocks, num_channels):
# branches = []
#
# for i in range(num_branches):
# branches.append(
# self._make_one_branch(i, block, num_blocks, num_channels)
# )
#
# return nn.ModuleList(branches)
#
# def _make_fuse_layers(self):
# if self.num_branches == 1:
# return None
#
# num_branches = self.num_branches
# num_inchannels = self.num_inchannels
# fuse_layers = []
# for i in range(num_branches if self.multi_scale_output else 1):
# fuse_layer = []
# for j in range(num_branches):
# if j > i:
# fuse_layer.append(
# nn.Sequential(
# nn.Conv2d(
# num_inchannels[j],
# num_inchannels[i],
# 1, 1, 0, bias=False
# ),
# nn.BatchNorm2d(num_inchannels[i]),
# nn.Upsample(scale_factor=2**(j-i), mode='nearest')
# )
# )
# elif j == i:
# fuse_layer.append(None)
# else:
# conv3x3s = []
# for k in range(i-j):
# if k == i - j - 1:
# num_outchannels_conv3x3 = num_inchannels[i]
# conv3x3s.append(
# nn.Sequential(
# nn.Conv2d(
# num_inchannels[j],
# num_outchannels_conv3x3,
# 3, 2, 1, bias=False
# ),
# nn.BatchNorm2d(num_outchannels_conv3x3)
# )
# )
# else:
# num_outchannels_conv3x3 = num_inchannels[j]
# conv3x3s.append(
# nn.Sequential(
# nn.Conv2d(
# num_inchannels[j],
# num_outchannels_conv3x3,
# 3, 2, 1, bias=False
# ),
# nn.BatchNorm2d(num_outchannels_conv3x3),
# nn.ReLU(True)
# )
# )
# fuse_layer.append(nn.Sequential(*conv3x3s))
# fuse_layers.append(nn.ModuleList(fuse_layer))
#
# return nn.ModuleList(fuse_layers)
#
# def get_num_inchannels(self):
# return self.num_inchannels
#
# def forward(self, x):
# if self.num_branches == 1:
# return [self.branches[0](x[0])]
#
# for i in range(self.num_branches):
# x[i] = self.branches[i](x[i])
#
# x_fuse = []
#
# for i in range(len(self.fuse_layers)):
# y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
# for j in range(1, self.num_branches):
# if i == j:
# y = y + x[j]
# else:
# y = y + self.fuse_layers[i][j](x[j])
# x_fuse.append(self.relu(y))
#
# return x_fuse
#
# class HRNet(nn.Module):
# blocks_dict = {
# 'BASIC' : BasicBlock,
# 'BOTTLENECK':Bottleneck
# }
#
# def __init__(self, extra, **kwargs):
# self.inplanes = 64
# self.extra = extra
# super(HRNet, self).__init__()
#
# # stem net
# self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
# bias=False)
# self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
# self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
# bias=False)
# self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
# self.relu = nn.ReLU(inplace=True)
# # stage1
# self.stage1_cfg = self.extra['stage1']
# num_channels = self.stage1_cfg['num_channels'][0]
# block_type = self.stage1_cfg['block']
# num_blocks = self.stage1_cfg['num_blocks'][0]
#
# block = self.blocks_dict[block_type]
# stage1_out_channels = num_channels * block.expansion
# self.layer1 = self._make_layer(block,
# num_channels,
# num_blocks
# )
#
# self.stage2_cfg = self.extra['stage2']
# num_channels = self.stage2_cfg['num_channels']
# block_type = self.stage2_cfg['block']
#
# block = self.blocks_dict[block_type]
# num_channels = [channel * block.expansion for channel in num_channels]
# self.transition1 = self._make_transition_layer([stage1_out_channels],
# num_channels)
# self.stage2, pre_stage_channels = self._make_stage(
# self.stage2_cfg, num_channels)
#
# # stage 3
#
# self.stage3_cfg = self.extra['stage3']
# num_channels = self.stage3_cfg['num_channels']
# block_type = self.stage3_cfg['block']
#
# block = self.blocks_dict[block_type]
# num_channels = [channel * block.expansion for channel in num_channels]
# self.transition2 = self._make_transition_layer(pre_stage_channels,
# num_channels)
# self.stage3, pre_stage_channels = self._make_stage(
# self.stage3_cfg, num_channels)
#
# # stage 4
# self.stage4_cfg = self.extra['stage4']
# num_channels = self.stage4_cfg['num_channels']
# block_type = self.stage4_cfg['block']
#
# block = self.blocks_dict[block_type]
# num_channels = [channel * block.expansion for channel in num_channels]
# self.transition3 = self._make_transition_layer(pre_stage_channels,
# num_channels)
# self.stage4, pre_stage_channels = self._make_stage(
# self.stage4_cfg, num_channels)
# self.init_weights()
#
#
# def _make_transition_layer(
# self,
# num_channels_pre_layer,
# num_channels_cur_layer):
# num_branches_cur = len(num_channels_cur_layer)
# num_branches_pre = len(num_channels_pre_layer)
#
# transition_layers = []
# for i in range(num_branches_cur):
# if i < num_branches_pre:
# if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
# transition_layers.append(
# nn.Sequential(
# nn.Conv2d(
# num_channels_pre_layer[i],
# num_channels_cur_layer[i],
# 3, 1, 1, bias=False
# ),
# nn.BatchNorm2d(num_channels_cur_layer[i]),
# nn.ReLU(inplace=True)
# )
# )
# else:
# transition_layers.append(None)
# else:
# conv3x3s = []
# for j in range(i+1-num_branches_pre):
# inchannels = num_channels_pre_layer[-1]
# outchannels = num_channels_cur_layer[i] \
# if j == i-num_branches_pre else inchannels
# conv3x3s.append(
# nn.Sequential(
# nn.Conv2d(
# inchannels, outchannels, 3, 2, 1, bias=False
# ),
# nn.BatchNorm2d(outchannels),
# nn.ReLU(inplace=True)
# )
# )
# transition_layers.append(nn.Sequential(*conv3x3s))
#
# return nn.ModuleList(transition_layers)
#
# def _make_layer(self,
# block,
# planes,
# blocks,
# stride=1):
# downsample = None
# if stride != 1 or self.inplanes != planes * block.expansion:
# downsample = nn.Sequential(
# nn.Conv2d(
# self.inplanes, planes * block.expansion,
# kernel_size=1, stride=stride, bias=False
# ),
# nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
# )
#
# layers = []
# layers.append(block(self.inplanes, planes, stride, downsample))
# self.inplanes = planes * block.expansion
# for i in range(1, blocks):
# layers.append(block(self.inplanes, planes))
#
# return nn.Sequential(*layers)
#
# def _make_stage(self, layer_config, num_inchannels,
# multi_scale_output=True):
# num_modules = layer_config['num_modules']
# num_branches = layer_config['num_branches']
# num_blocks = layer_config['num_blocks']
# num_channels = layer_config['num_channels']
# block = self.blocks_dict[layer_config['block']]
# fuse_method = layer_config['fuse_method']
#
# modules = []
# for i in range(num_modules):
# # multi_scale_output is only used last module
# if not multi_scale_output and i == num_modules - 1:
# reset_multi_scale_output = False
# else:
# reset_multi_scale_output = True
#
# modules.append(
# HRModule(
# num_branches,
# block,
# num_blocks,
# num_inchannels,
# num_channels,
# fuse_method,
# reset_multi_scale_output
# )
# )
# num_inchannels = modules[-1].get_num_inchannels()
#
# return nn.Sequential(*modules), num_inchannels
#
# def forward(self, x):
# x = self.conv1(x)
# x = self.bn1(x)
# x = self.relu(x)
# x = self.conv2(x)
# x = self.bn2(x)
# x = self.relu(x)
# x = self.layer1(x)
#
# x_list = []
# for i in range(self.stage2_cfg['num_branches']):
# if self.transition1[i] is not None:
# x_list.append(self.transition1[i](x))
# else:
# x_list.append(x)
# y_list = self.stage2(x_list)
#
# x_list = []
# for i in range(self.stage3_cfg['num_branches']):
# if self.transition2[i] is not None:
# x_list.append(self.transition2[i](y_list[-1]))
# else:
# x_list.append(y_list[i])
# y_list = self.stage3(x_list)
#
# x_list = []
# for i in range(self.stage4_cfg['num_branches']):
# if self.transition3[i] is not None:
# x_list.append(self.transition3[i](y_list[-1]))
# else:
# x_list.append(y_list[i])
# y_list = self.stage4(x_list)
# return y_list
#
# def init_weights(self, pretrained=None):
# if isinstance(pretrained, str):
# logger = logging.getLogger()
# load_checkpoint(self, pretrained, strict=False, logger=logger)
# elif pretrained is None:
# for m in self.modules():
# if isinstance(m, nn.Conv2d):
# kaiming_init(m)
# elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
# constant_init(m, 1)
from .gconv_origin import ConvTemporalGraphical
from .graph import Graph
# The basic unit of graph convolutional networks.
import torch
import torch.nn as nn
class GraphConvND(nn.Module):
    def __init__(self, N, in_channels, out_channels, kernel_size, stride,
                 padding, dilation, groups, bias, padding_mode):
        super().__init__()  # required before registering self.conv below
        # Coerce ints to per-dimension lists so the list concatenations below
        # work with the int/tuple defaults of the 1D/2D/3D subclasses.
        def _to_list(v):
            return [v] * N if isinstance(v, int) else list(v)
        kernel_size = _to_list(kernel_size)
        stride = _to_list(stride)
        padding = _to_list(padding)
        dilation = _to_list(dilation)
        graph_kernel_size = kernel_size[0]
        graph_stride = stride[0]
        graph_padding = padding[0]
        graph_dilation = dilation[0]
        if graph_stride != 1 or graph_padding != 0 or graph_dilation != 1:
            raise NotImplementedError
if N == 1:
conv_type = nn.Conv1d
self.einsum_func = 'nkcv,kvw->ncw'
elif N == 2:
conv_type = nn.Conv2d
self.einsum_func = 'nkcvx,kvw->ncwx'
elif N == 3:
conv_type = nn.Conv3d
self.einsum_func = 'nkcvxy,kvw->ncwxy'
self.out_channels = out_channels
self.graph_kernel_size = graph_kernel_size
self.conv = conv_type(in_channels,
out_channels * graph_kernel_size,
kernel_size=[1] + kernel_size[1:],
stride=[1] + stride[1:],
padding=[0] + padding[1:],
dilation=[1] + dilation[1:],
groups=groups,
bias=bias,
padding_mode=padding_mode)
def forward(self, x, graph):
# graph is an adjacency matrix
if graph.dim() == 2:
A, out_graph = self.normalize_adjacency_matrix(graph)
# graph is a weight matrix
elif graph.dim() == 3:
A, out_graph = graph, None
else:
raise ValueError('input[1].dim() should be 2 or 3.')
x = self.conv(x)
x = x.view((x.size(0), self.graph_kernel_size, self.out_channels) +
x.size()[2:])
x = torch.einsum(self.einsum_func, (x, A))
return x.contiguous(), out_graph
    def normalize_adjacency_matrix(self, graph):
        # Hook for subclasses: turn a 2D adjacency matrix into a normalized
        # (K, V, V) weight tensor.
        raise NotImplementedError
class GraphConv(GraphConvND):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
padding_mode='zeros'):
super().__init__(1, in_channels, out_channels, kernel_size, stride,
padding, dilation, groups, bias, padding_mode)
class GraphConv2D(GraphConvND):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=(1, 1),
padding=(0, 0),
dilation=(1, 1),
groups=1,
bias=True,
padding_mode='zeros'):
super().__init__(2, in_channels, out_channels, kernel_size, stride,
padding, dilation, groups, bias, padding_mode)
class GraphConv3D(GraphConvND):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=(1, 1, 1),
padding=(0, 0, 0),
dilation=(1, 1, 1),
groups=1,
bias=True,
padding_mode='zeros'):
super().__init__(3, in_channels, out_channels, kernel_size, stride,
padding, dilation, groups, bias, padding_mode)
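# Illustrative shape check (a sketch; all sizes synthetic): a 3-partition
# graph convolution over V=25 nodes and T=30 frames with a precomputed
# (K, V, V) weight tensor, following the 'nkcvx,kvw->ncwx' einsum layout.
if __name__ == '__main__':
    conv = GraphConv2D(3, 16, kernel_size=(3, 1))
    x = torch.randn(2, 3, 25, 30)  # (N, C, V, T)
    A = torch.rand(3, 25, 25)      # (K, V, V)
    out, _ = conv(x, A)
    print(out.shape)               # torch.Size([2, 16, 25, 30])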
# The basic unit of graph convolutional networks.
# This is the original implementation from the ST-GCN paper.
import numpy as np
import torch
import torch.nn as nn
class ConvTemporalGraphical(nn.Module):
r"""The basic module for applying a graph convolution.
Args:
in_channels (int): Number of channels in the input sequence data
out_channels (int): Number of channels produced by the convolution
kernel_size (int): Size of the graph convolving kernel
t_kernel_size (int): Size of the temporal convolving kernel
t_stride (int, optional): Stride of the temporal convolution. Default: 1
t_padding (int, optional): Temporal zero-padding added to both sides of
the input. Default: 0
t_dilation (int, optional): Spacing between temporal kernel elements.
Default: 1
bias (bool, optional): If ``True``, adds a learnable bias to the output.
Default: ``True``
Shape:
- Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
- Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
- Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
- Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
where
:math:`N` is a batch size,
:math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`,
:math:`T_{in}/T_{out}` is a length of input/output sequence,
:math:`V` is the number of graph nodes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
t_kernel_size=1,
t_stride=1,
t_padding=0,
t_dilation=1,
bias=True):
super().__init__()
self.kernel_size = kernel_size
self.conv = nn.Conv2d(in_channels,
out_channels * kernel_size,
kernel_size=(t_kernel_size, 1),
padding=(t_padding, 0),
stride=(t_stride, 1),
dilation=(t_dilation, 1),
bias=bias)
self.out_channels = out_channels
def forward(self, x, A):
assert A.size(0) == self.kernel_size
x = self.conv(x)
n, kc, t, v = x.size()
x = x.view(n, self.kernel_size, kc // self.kernel_size, t, v)
x = torch.einsum('nkctv,kvw->nctw', (x, A))
return x.contiguous(), A
class Gconv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        if isinstance(kernel_size, int):
            gcn_kernel_size = kernel_size
            feature_dim = 0
        elif isinstance(kernel_size, (list, tuple)):
            gcn_kernel_size = kernel_size[0]
            cnn_kernel_size = [1] + list(kernel_size[1:])
            feature_dim = len(kernel_size) - 1
        else:
            raise ValueError(
                'The type of kernel_size should be int, list or tuple.')
if feature_dim == 1:
self.conv = nn.Conv1d(in_channels,
out_channels * gcn_kernel_size,
kernel_size=cnn_kernel_size)
elif feature_dim == 2:
pass
elif feature_dim == 3:
pass
elif feature_dim == 0:
pass
else:
raise ValueError(
'The length of kernel_size should be 1, 2, 3, or 4')
    def forward(self, X, A):
        raise NotImplementedError  # stub: only the 1D branch is wired up above
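# Illustrative shape check (a sketch; all sizes synthetic): spatial graph
# convolution with K=3 partitions over V=25 joints and T=30 frames.
if __name__ == '__main__':
    gcn = ConvTemporalGraphical(in_channels=3, out_channels=16, kernel_size=3)
    x = torch.randn(2, 3, 30, 25)  # (N, C, T, V)
    A = torch.rand(3, 25, 25)      # (K, V, V)
    out, _ = gcn(x, A)
    print(out.shape)               # torch.Size([2, 16, 30, 25])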
import numpy as np
class Graph():
""" The Graph to model the skeletons extracted by the openpose
Args:
strategy (string): must be one of the follow candidates
- uniform: Uniform Labeling
- distance: Distance Partitioning
- spatial: Spatial Configuration
For more information, please refer to the section 'Partition Strategies'
in our paper (https://arxiv.org/abs/1801.07455).
layout (string): must be one of the follow candidates
- openpose: Is consists of 18 joints. For more information, please
refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose#output
- ntu-rgb+d: Is consists of 25 joints. For more information, please
refer to https://github.com/shahroudy/NTURGB-D
max_hop (int): the maximal distance between two connected nodes
dilation (int): controls the spacing between the kernel points
"""
def __init__(self,
layout='openpose',
strategy='uniform',
max_hop=1,
dilation=1):
self.max_hop = max_hop
self.dilation = dilation
self.get_edge(layout)
        # hop distance: self = 0, direct edge = 1, unreachable = inf
self.hop_dis = get_hop_distance(self.num_node,
self.edge,
max_hop=max_hop)
self.get_adjacency(strategy)
    def __str__(self):
        return str(self.A)
    def get_edge(self, layout):
        # edge is a list of [child, parent] pairs
if layout == 'openpose':
self.num_node = 18
self_link = [(i, i) for i in range(self.num_node)]
neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5),
(13, 12), (12, 11), (10, 9), (9, 8), (11, 5),
(8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0),
(17, 15), (16, 14)]
self.edge = self_link + neighbor_link
self.center = 1
elif layout == 'ntu-rgb+d':
self.num_node = 25
self_link = [(i, i) for i in range(self.num_node)]
neighbor_1base = [(1, 2), (2, 21), (3, 21),
(4, 3), (5, 21), (6, 5), (7, 6), (8, 7), (9, 21),
(10, 9), (11, 10), (12, 11), (13, 1), (14, 13),
(15, 14), (16, 15), (17, 1), (18, 17), (19, 18),
(20, 19), (22, 23), (23, 8), (24, 25), (25, 12)]
neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
self.edge = self_link + neighbor_link
self.center = 21 - 1
elif layout == 'ntu_edge':
self.num_node = 24
self_link = [(i, i) for i in range(self.num_node)]
neighbor_1base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6),
(8, 7), (9, 2), (10, 9), (11, 10), (12, 11),
(13, 1), (14, 13), (15, 14), (16, 15), (17, 1),
(18, 17), (19, 18), (20, 19), (21, 22), (22, 8),
(23, 24), (24, 12)]
neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
self.edge = self_link + neighbor_link
self.center = 2
elif layout == 'coco':
self.num_node = 17
self_link = [(i, i) for i in range(self.num_node)]
neighbor_1base = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13],
[6, 12], [7, 13], [6, 7], [8, 6], [9, 7],
[10, 8], [11, 9], [2, 3], [2, 1], [3, 1], [4, 2],
[5, 3], [4, 6], [5, 7]]
neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
self.edge = self_link + neighbor_link
self.center = 0
# elif layout=='customer settings'
# pass
else:
raise ValueError("Do Not Exist This Layout.")
def get_adjacency(self, strategy):
valid_hop = range(0, self.max_hop + 1, self.dilation)
adjacency = np.zeros((self.num_node, self.num_node))
for hop in valid_hop:
            adjacency[self.hop_dis == hop] = 1  # self-loops and direct neighbors = 1
normalize_adjacency = normalize_digraph(adjacency)
if strategy == 'uniform':
A = np.zeros((1, self.num_node, self.num_node))
A[0] = normalize_adjacency
self.A = A
elif strategy == 'distance':
A = np.zeros((len(valid_hop), self.num_node, self.num_node))
for i, hop in enumerate(valid_hop):
A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis ==
hop]
self.A = A
elif strategy == 'spatial':
A = []
for hop in valid_hop:
a_root = np.zeros((self.num_node, self.num_node))
a_close = np.zeros((self.num_node, self.num_node))
a_further = np.zeros((self.num_node, self.num_node))
for i in range(self.num_node):
for j in range(self.num_node):
if self.hop_dis[j, i] == hop:
if self.hop_dis[j, self.center] == self.hop_dis[
i, self.center]:
a_root[j, i] = normalize_adjacency[j, i]
elif self.hop_dis[j, self.center] > self.hop_dis[
i, self.center]:
a_close[j, i] = normalize_adjacency[j, i]
else:
a_further[j, i] = normalize_adjacency[j, i]
if hop == 0:
A.append(a_root)
else:
A.append(a_root + a_close)
A.append(a_further)
A = np.stack(A)
self.A = A
else:
raise ValueError("Do Not Exist This Strategy")
def get_hop_distance(num_node, edge, max_hop=1):
A = np.zeros((num_node, num_node))
for i, j in edge:
A[j, i] = 1
A[i, j] = 1
# compute hop steps
hop_dis = np.zeros((num_node, num_node)) + np.inf
transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
arrive_mat = (np.stack(transfer_mat) > 0)
for d in range(max_hop, -1, -1):
hop_dis[arrive_mat[d]] = d
return hop_dis
def normalize_digraph(A):
Dl = np.sum(A, 0)
num_node = A.shape[0]
Dn = np.zeros((num_node, num_node))
for i in range(num_node):
if Dl[i] > 0:
Dn[i, i] = Dl[i]**(-1)
AD = np.dot(A, Dn)
return AD
def normalize_undigraph(A):
Dl = np.sum(A, 0)
num_node = A.shape[0]
Dn = np.zeros((num_node, num_node))
for i in range(num_node):
if Dl[i] > 0:
Dn[i, i] = Dl[i]**(-0.5)
DAD = np.dot(np.dot(Dn, A), Dn)
return DAD
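# Illustrative usage (a sketch): the spatial partitioning strategy on the
# NTU RGB+D layout yields a (3, 25, 25) stack of root/close/further matrices.
if __name__ == '__main__':
    graph = Graph(layout='ntu-rgb+d', strategy='spatial')
    print(graph.A.shape)  # (3, 25, 25)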
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from mmskl.st_gcn import ConvTemporalGraphical, Graph
import numpy as np
def zero(x):
return 0
def iden(x):
return x
class ST_GCN_18(nn.Module):
r"""Spatial temporal graph convolutional networks.
Args:
in_channels (int): Number of channels in the input data
num_class (int): Number of classes for the classification task
graph_cfg (dict): The arguments for building the graph
edge_importance_weighting (bool): If ``True``, adds a learnable
importance weighting to the edges of the graph
**kwargs (optional): Other parameters for graph convolution units
Shape:
- Input: :math:`(N, in_channels, T_{in}, V_{in}, M_{in})`
- Output: :math:`(N, num_class)` where
:math:`N` is a batch size,
:math:`T_{in}` is a length of input sequence,
:math:`V_{in}` is the number of graph nodes,
        :math:`M_{in}` is the number of instances in a frame.
"""
def __init__(self,
in_channels,
num_class,
graph_cfg,
edge_importance_weighting=True,
data_bn=True,
**kwargs):
super().__init__()
# load graph
self.graph = Graph(**graph_cfg)
A = torch.tensor(self.graph.A,
dtype=torch.float32,
requires_grad=False)
self.register_buffer('A', A)
# build networks
spatial_kernel_size = A.size(0)
temporal_kernel_size = 9
kernel_size = (temporal_kernel_size, spatial_kernel_size)
self.data_bn = nn.BatchNorm1d(in_channels *
A.size(1)) if data_bn else iden
kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}
# self.st_gcn_networks = nn.ModuleList((
# st_gcn_block(in_channels,
# 64,
# kernel_size,
# 1,
# residual=False,
# **kwargs0),
# st_gcn_block(64, 64, kernel_size, 2, **kwargs),
# st_gcn_block(64, 128, kernel_size, 2, **kwargs),
# st_gcn_block(128, 128, kernel_size, 1, **kwargs),
# st_gcn_block(128, 256, kernel_size, 2, **kwargs),
# st_gcn_block(256, 256, kernel_size, 2, **kwargs),
# st_gcn_block(256, 64, kernel_size, 1, **kwargs),
# ))
# self.st_gcn_networks = nn.ModuleList((
# st_gcn_block(in_channels,
# 64,
# kernel_size,
# 1,
# residual=False,
# **kwargs0),
# st_gcn_block(64, 64, kernel_size, 1, **kwargs),
# st_gcn_block(64, 128, kernel_size, 2, **kwargs),
# st_gcn_block(128, 128, kernel_size, 1, **kwargs),
# st_gcn_block(128, 256, kernel_size, 2, **kwargs),
# st_gcn_block(256, 256, kernel_size, 1, **kwargs),
# st_gcn_block(256, 64, kernel_size, 1, **kwargs),
# ))
self.st_gcn_networks = nn.ModuleList((
st_gcn_block(in_channels,
64,
kernel_size,
1,
residual=False,
**kwargs0),
st_gcn_block(64, 64, kernel_size, 1, **kwargs),
st_gcn_block(64, 64, kernel_size, 1, **kwargs),
st_gcn_block(64, 64, kernel_size, 1, **kwargs),
st_gcn_block(64, 128, kernel_size, 2, **kwargs),
st_gcn_block(128, 128, kernel_size, 1, **kwargs),
st_gcn_block(128, 128, kernel_size, 1, **kwargs),
st_gcn_block(128, 256, kernel_size, 2, **kwargs),
st_gcn_block(256, 256, kernel_size, 1, **kwargs),
st_gcn_block(256, 256, kernel_size, 1, **kwargs),
))
# initialize parameters for edge importance weighting
if edge_importance_weighting:
self.edge_importance = nn.ParameterList([
nn.Parameter(torch.ones(self.A.size()))
for i in self.st_gcn_networks
])
else:
self.edge_importance = [1] * len(self.st_gcn_networks)
# fcn for prediction
# self.fcn = nn.Conv2d(256, num_class, kernel_size=1)
def forward(self, x):
# data normalization
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
x = x.view(N * M, C, T, V)
# forward
for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
x, _ = gcn(x, self.A * importance)
        # x: (N*M, C, T', V) after the ST-GCN blocks
# global pooling x.size()[2:] = (300, 25)
# x = F.avg_pool2d(x, x.size()[2:])
NM, C, T, V = x.size()
x = x.view(N, M, -1, T, V).mean(dim=1)
# prediction
# x = self.fcn(x)
# x = x.view(x.size(0), -1)
return x
def extract_feature(self, x):
# data normalization
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
x = x.view(N * M, C, T, V)
        # forward
for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
x, _ = gcn(x, self.A * importance)
_, c, t, v = x.size()
feature = x.view(N, M, c, t, v).permute(0, 2, 3, 4, 1)
# prediction
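        # NOTE: self.fcn is commented out in __init__ above; re-enable it
        # before calling extract_feature.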
x = self.fcn(x)
output = x.view(N, M, -1, t, v).permute(0, 2, 3, 4, 1)
return output, feature
class st_gcn_block(nn.Module):
r"""Applies a spatial temporal graph convolution over an input graph sequence.
Args:
in_channels (int): Number of channels in the input sequence data
out_channels (int): Number of channels produced by the convolution
kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel
stride (int, optional): Stride of the temporal convolution. Default: 1
dropout (int, optional): Dropout rate of the final output. Default: 0
residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True``
Shape:
- Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
- Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
- Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
where
:math:`N` is a batch size,
:math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`,
:math:`T_{in}/T_{out}` is a length of input/output sequence,
:math:`V` is the number of graph nodes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dropout=0,
residual=True):
super().__init__()
assert len(kernel_size) == 2
assert kernel_size[0] % 2 == 1
padding = ((kernel_size[0] - 1) // 2, 0)
self.gcn = ConvTemporalGraphical(in_channels, out_channels,
kernel_size[1])
self.tcn = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(
out_channels,
out_channels,
(kernel_size[0], 1),
(stride, 1),
padding,
),
nn.BatchNorm2d(out_channels),
nn.Dropout(dropout, inplace=True),
)
if not residual:
self.residual = zero
elif (in_channels == out_channels) and (stride == 1):
self.residual = iden
else:
self.residual = nn.Sequential(
nn.Conv2d(in_channels,
out_channels,
kernel_size=1,
stride=(stride, 1)),
nn.BatchNorm2d(out_channels),
)
self.relu = nn.ReLU(inplace=True)
self.out_channels = out_channels
self.in_channels = in_channels
def forward(self, x, A):
res = self.residual(x)
x, A = self.gcn(x, A)
x = self.tcn(x) + res
return self.relu(x), A
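# Illustrative shape check (a sketch; batch and sequence sizes synthetic):
# with T=30 input frames and two stride-2 blocks, the temporal length drops
# 30 -> 15 -> 8 while the channels grow to 256.
if __name__ == '__main__':
    model = ST_GCN_18(in_channels=3,
                      num_class=60,
                      dropout=0.1,
                      edge_importance_weighting=False,
                      graph_cfg={'layout': 'ntu-rgb+d', 'strategy': 'spatial'})
    x = torch.randn(2, 3, 30, 25, 2)  # (N, C, T, V, M)
    print(model(x).shape)             # torch.Size([2, 256, 8, 25])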
# coding=utf-8
from __future__ import print_function
import torch.utils.data as data
from PIL import Image
import numpy as np
import shutil
import errno
import torch
import os
import pickle
import random
import gl
'''
Inspired by https://github.com/pytorch/vision/pull/46
'''
IMG_CACHE = {}
class NTU_RGBD_Dataset(data.Dataset):
def __init__(self, mode='train', data_list=None, debug=False, extract_frame=1, transform=None, target_transform=None):
        '''
        Each item is a (skeleton tensor, label) pair.
        Args:
        - mode: which split to load ('train', 'val' or 'test')
        - debug: if True, keep only 10% of the samples
        - extract_frame: if 1, subsample each clip to a fixed number of frames
        - transform: how to transform the input
        - target_transform: how to transform the target
        '''
super(NTU_RGBD_Dataset, self).__init__()
self.transform = transform
self.target_transform = target_transform
if gl.dataset == 'ntu120_30':
path="********************************to be specified********************************"
segment = 30
print('data_path :{}'.format(path))
if mode == 'train':
data_path = os.path.join(path, 'train_data.npy')
label_path = os.path.join(path, 'train_label.npy')
num_frame = os.path.join(path, 'train_frame.npy')
elif mode == 'val':
data_path = os.path.join(path, 'val_data.npy')
label_path = os.path.join(path, 'val_label.npy')
num_frame = os.path.join(path, 'val_frame.npy')
else:
data_path = os.path.join(path, 'test_data.npy')
label_path = os.path.join(path, 'test_label.npy')
num_frame = os.path.join(path, 'test_frame.npy')
self.data, self.label, self.num_frame = np.load(data_path), np.load(label_path), np.load(num_frame)
# print('min = ', np.min(self.data), ' max = ', np.max(self.data))
if debug:
data_len = len(self.label)
data_len = int(0.1 * data_len)
self.label = self.label[0:data_len]
self.data = self.data[0:data_len]
self.num_frame = self.num_frame[0:data_len]
if extract_frame == 1:
self.data = self.extract_frame(self.data, self.num_frame, segment)
print('sample_num in {}'.format(mode), len(self.label))
n_classes = len(np.unique(self.label))
print('n_class', n_classes)
def __getitem__(self, idx):
x = self.data[idx]
if self.transform:
x = self.transform(x)
return x, self.label[idx]
def __len__(self):
return len(self.label)
def extract_frame(self, x, num_frame, segment):
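        # Uniform temporal subsampling: split each clip's frames into
        # `segment` bins and draw one random frame per bin; clips shorter
        # than `segment` frames take their first `segment` slots instead.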
n, c, t, v, m = x.shape
assert n == len(num_frame)
num_frame = np.array(num_frame)
step = num_frame // segment
new_x = []
for i in range(n):
if num_frame[i] < segment:
new_x.append(np.expand_dims(x[i, :, 0:segment, :, :], 0).reshape(1, c, segment, v, m))
continue
idx = [random.randint(j * step[i], (j + 1) * step[i] - 1) for j in range(segment)]
new_x.append(np.expand_dims(x[i, :, idx, :, :], 0).reshape(1, c, segment, v, m))
new_x = np.concatenate(new_x, 0)
return new_x
# coding=utf-8
import os
import argparse
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-root', '--dataset',
type=str,
help='type of dataset',
default='ntu120_30')
parser.add_argument('-mode', '--mode',
type=str,
help='mode',
default='train')
    parser.add_argument('-reg', '--reg_rate',
                        type=float,
                        help='RankMax regularization rate',
                        default=0.1)
parser.add_argument('-reg_thred', '--thred',
type=int,
help='threshold',
default=3)
    parser.add_argument('-gama', '--gamma',
                        type=float,
                        help='gamma for soft-DTW',
                        default=0.01)
    parser.add_argument('-dbg', '--debug',
                        type=int,
                        help='debug mode: save x and sim_tensor',
                        default=0)
    parser.add_argument('-model', '--model',
                        type=int,
                        help='whether to use the best model',
                        default=0)
parser.add_argument('-backbone', '--backbone',
type=str,
help='backbone type st_gcn, 2s_AGCN, ms_g3d',
default='stgcn')
    parser.add_argument('-extrf', '--extract_frame',
                        type=int,
                        help='whether to subsample frames from each clip',
                        default=1)
parser.add_argument('-exp', '--experiment_root',
type=str,
help='root where to store models, losses and accuracies',
default='test')
parser.add_argument('-d', '--device',
type=int,
help='GPU device',
default=0)
    parser.add_argument('-dt', '--dtw',
                        type=int,
                        help='whether to use DTW',
                        default=1)
parser.add_argument('-nep', '--epochs',
type=int,
help='number of epochs to train for',
default=100)
parser.add_argument('-lr', '--learning_rate',
type=float,
help='learning rate for the model, default=0.001',
default=0.001)
parser.add_argument('-lrf', '--lr_flag',
type=str,
help='lr_scheduler type',
default='reduceLR')
parser.add_argument('-lrS', '--lr_scheduler_step',
type=int,
help='StepLR learning rate scheduler step, default=20',
default=20)
parser.add_argument('-lrG', '--lr_scheduler_gamma',
type=float,
help='StepLR learning rate scheduler gamma, default=0.5',
default=0.5)
    parser.add_argument('-its', '--train_iterations',
                        type=int,
                        help='number of episodes per epoch, default=1000',
                        default=1000)
    parser.add_argument('-cTr', '--classes_per_it_tr',
                        type=int,
                        help='number of random classes per episode for training, default=5',
                        default=5)
    parser.add_argument('-nsTr', '--num_support_tr',
                        type=int,
                        help='number of samples per class to use as support for training, default=1',
                        default=1)
    parser.add_argument('-nqTr', '--num_query_tr',
                        type=int,
                        help='number of samples per class to use as query for training, default=10',
                        default=10)
    parser.add_argument('-test_its', '--test_iterations',
                        type=int,
                        help='number of episodes per epoch, default=500',
                        default=500)
    parser.add_argument('-cVa', '--classes_per_it_val',
                        type=int,
                        help='number of random classes per episode for validation, default=5',
                        default=5)
    parser.add_argument('-nsVa', '--num_support_val',
                        type=int,
                        help='number of samples per class to use as support for validation, default=1',
                        default=1)
    parser.add_argument('-nqVa', '--num_query_val',
                        type=int,
                        help='number of samples per class to use as query for validation, default=10',
                        default=10)
parser.add_argument('-seed', '--manual_seed',
type=int,
help='input for the manual seeds initializations',
default=7)
parser.add_argument('--cuda',
action='store_true',
help='enables cuda')
    parser.add_argument('--SA',
                        type=int,
                        help='enable spatial activation (1) or not (0)',
                        default=0)
return parser
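# Quick sanity check (a sketch): parse the flags used by the run commands in
# the README and print the fields they map to.
if __name__ == '__main__':
    opt = get_parser().parse_args(['--SA', '1', '--reg', '0.1'])
    print(opt.SA, opt.reg_rate)  # 1 0.1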
import time
import torch.nn as nn
import torch
import numpy as np
from mmskl.st_gcn_aaai18 import ST_GCN_18
from utils import get_support_query_data, extract_k_segement, compute_similarity, euclidean_dist, euclidean_distance
from torch.nn import functional as F
import gl
from soft_dtw import SoftDTW
from cross_attention import CrossAttention
class ProtoNet(nn.Module):
def __init__(self, opt):
super(ProtoNet, self).__init__()
        if 'ntu' in gl.dataset:
            node = 25
            ms_graph = 'graph.ntu_rgb_d.AdjMatrixGraph'
            sh_graph = 'shift_gcn_graph.ntu_rgb_d.Graph'
            st_graph = {'layout': 'ntu-rgb+d', 'strategy': 'spatial'}
        elif gl.dataset == 'kinetics':
            node = 18
            ms_graph = 'graph.kinetics.AdjMatrixGraph'
            sh_graph = 'shift_gcn_graph.kinetics.Graph'
            st_graph = {'layout': 'openpose', 'strategy': 'spatial'}
        else:
            ms_graph = None
            sh_graph = None
            st_graph = None
            node = 0
self.model = ST_GCN_18(
in_channels=3,
num_class=60,
dropout=0.1,
edge_importance_weighting=False,
graph_cfg=st_graph
)
self.out_channel = 256
if gl.SA == 1:
self.attention_x = CrossAttention(num_attention_heads=1, input_size=self.out_channel, hidden_size=self.out_channel, hidden_dropout_prob=0.2)
self.attention_y = CrossAttention(num_attention_heads=1, input_size=self.out_channel, hidden_size=self.out_channel, hidden_dropout_prob=0.2)
else:
self.attention_x = None
self.attention_y = None
def loss(self, input, target, n_support, dtw):
        # input has already been encoded by ST_GCN
n, c, t, v = input.size()
def supp_idxs(cc):
# FIXME when torch will support where as np
return torch.nonzero(target.eq(cc))[:n_support].squeeze(1)
# FIXME when torch.unique will be available on cuda too
classes = torch.unique(target)
n_class = len(classes)
# FIXME when torch will support where as np
# assuming n_query, n_target constants
n_query = target.eq(classes[0].item()).sum().item() - n_support
support_idxs = list(map(supp_idxs, classes))
z_proto = torch.stack([input[idx_list] for idx_list in support_idxs]).view(-1, c, t, v)
# FIXME when torch will support where as np
query_idxs = torch.stack(list(map(lambda c: torch.nonzero(target.eq(c))[n_support:], classes))).view(-1)
zq = input[query_idxs.long()]
z_proto = z_proto.view(n_class, n_support, c, t, v).mean(1) # n, c, t, v
if dtw > 0:
dist, reg_loss = self.dtw_loss(zq, z_proto)
else:
#zq, z_proto = F.avg_pool2d(zq, zq.size()[2:]).view(n_class * n_query, c), F.avg_pool2d(z_proto, z_proto.size()[2:]).view(n_class, c)
zq = zq.view(n_class * n_query, -1)
z_proto = z_proto.view(n_class, -1)
dist = euclidean_dist(zq, z_proto)
reg_loss = torch.tensor(0).float().to(gl.device)
log_p_y = F.log_softmax(-dist, dim=1).view(n_class, n_query, -1)
target_inds = torch.arange(0, n_class).to(gl.device)
target_inds = target_inds.view(n_class, 1, 1)
target_inds = target_inds.expand(n_class, n_query, 1).long()
loss_val = -log_p_y.gather(2, target_inds).squeeze().view(-1).mean()
_, y_hat = log_p_y.max(2)
acc_val = y_hat.eq(target_inds.squeeze()).float().mean()
if gl.reg_rate > 0:
loss_val += reg_loss
return loss_val, acc_val, reg_loss
def dtw_loss(self, zq, z_proto):
        if self.attention_x is not None:
zq = zq.permute(0, 2, 3, 1).contiguous() # n, t, v, c
z_proto = z_proto.permute(0, 2, 3, 1).contiguous()
dist = self.attention_dtw_dist(zq, z_proto)
else:
z_proto = z_proto.permute(0, 2, 3, 1).contiguous()
zq = zq.permute(0, 2, 3, 1).contiguous()
dist = self.dtw_dist(zq, z_proto)
reg_loss = torch.tensor(0).float().to(gl.device)
if gl.reg_rate > 0:
reg_loss = self.svd_reg_spatial(z_proto) + self.svd_reg_spatial(zq)
rate = gl.reg_rate
reg_loss = reg_loss * rate
return dist, reg_loss
def attention_dtw_dist(self, x, y):
        '''
        :param x: [n, t, v, c] z_query
        :param y: [m, t, v, c] z_proto
        :return: [n, m]
        '''
n, t, v, c = x.size()
m, _, _, _ = y.size()
x = x.unsqueeze(1).expand(n, m, t, v, c).reshape(n * m, t, v, c)
y = y.unsqueeze(0).expand(n, m, t, v, c).reshape(n * m, t, v, c)
sdtw = SoftDTW(gamma=gl.gamma, normalize=False, attention=self.attention_x, attention_y=self.attention_y)
loss = sdtw(x, y)
return loss.view(n, m)
def dtw_dist(self, x, y):
if len(x.size()) == 4:
n, t, v, c = x.size()
x = x.view(n, t, v * c)
y = y.view(-1, t, v * c)
n, t, c = x.size()
m, _, _ = y.size()
x = x.unsqueeze(1).expand(n, m, t, c).reshape(n * m, t, c)
y = y.unsqueeze(0).expand(n, m, t, c).reshape(n * m, t, c)
sdtw = SoftDTW(gamma=gl.gamma, normalize=False, attention=self.attention_x, attention_y=self.attention_y)
loss = sdtw(x, y)
return loss.view(n, m)
def svd_reg_spatial(self, x):
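        # This appears to be a fast Batch Nuclear-norm Maximization style
        # term: the sorted column L2-norms of the softmaxed (v, c) feature
        # matrix approximate its singular values, so the negative mean of the
        # top min(v, c) of them rewards higher-rank (more diverse) features.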
if len(x.size()) == 4:
n, t, v, c = x.size()
            x = x.view(-1, v, c)
loss = torch.tensor(0).float().to(gl.device)
for i in range(x.size()[0]):
transpose_X = x[i]
# fast version
softmax_tgt = torch.softmax((transpose_X - torch.max(transpose_X)), dim=1)
list_svd, _ = torch.sort(torch.sqrt(torch.sum(torch.pow(softmax_tgt, 2), dim=0)), descending=True)
method_loss = -torch.mean(list_svd[:min(softmax_tgt.shape[0], softmax_tgt.shape[1])])
loss += method_loss
return loss / x.size()[0]
def idm_reg(self, x):
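        # NOTE: relies on self.get_W, which is not defined in this file; this
        # regularizer appears unused by the current loss path.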
n, t, c = x.size()
reg_loss = torch.tensor(0).float().to(gl.device)
thred = 5
margin = 2
weight, inverse_weight = self.get_W(x, thred)
for i in range(n):
dist = euclidean_dist(x[i, :, :], x[i, :, :]) # t * t
inverse_dist = torch.max(torch.zeros(t, t).to(gl.device), margin - dist).to(gl.device)
reg_loss += (inverse_dist * inverse_weight + dist * weight).sum()
return reg_loss / n
def forward(self, x):
x = self.model(x)
return x
# coding=utf-8
import numpy as np
import torch
class PrototypicalBatchSampler(object):
'''
PrototypicalBatchSampler: yield a batch of indexes at each iteration.
Indexes are calculated by keeping in account 'classes_per_it' and 'num_samples',
In fact at every iteration the batch indexes will refer to 'num_support' + 'num_query' samples
for 'classes_per_it' random classes.
__len__ returns the number of episodes per epoch (same as 'self.iterations').
'''
def __init__(self, labels, classes_per_it, num_samples, iterations):
'''
Initialize the PrototypicalBatchSampler object
Args:
- labels: an iterable containing all the labels for the current dataset
samples indexes will be infered from this iterable.
- classes_per_it: number of random classes for each iteration
- num_samples: number of samples for each iteration for each class (support + query)
- iterations: number of iterations (episodes) per epoch
'''
super(PrototypicalBatchSampler, self).__init__()
self.labels = labels
# print(labels,len(labels))
self.classes_per_it = classes_per_it
self.sample_per_class = num_samples
self.iterations = iterations
self.classes, self.counts = np.unique(self.labels, return_counts=True)
self.classes = torch.LongTensor(self.classes)
# create a matrix, indexes, of dim: classes X max(elements per class)
# fill it with nans
# for every class c, fill the relative row with the indices samples belonging to c
# in numel_per_class we store the number of samples for each class/row
self.idxs = range(len(self.labels))
        self.indexes = np.full((len(self.classes), max(self.counts)), np.nan)
self.indexes = torch.Tensor(self.indexes)
self.numel_per_class = torch.zeros_like(self.classes)
for idx, label in enumerate(self.labels):
# print((self.classes == label).numpy().astype(int))
label_idx = np.argwhere((self.classes == label).numpy().astype(int)).item()
# print(label_idx)
self.indexes[label_idx, np.where(np.isnan(self.indexes[label_idx]))[0][0]] = idx
self.numel_per_class[label_idx] += 1
def __iter__(self):
'''
yield a batch of indexes
'''
spc = self.sample_per_class
cpi = self.classes_per_it
for it in range(self.iterations):
batch_size = spc * cpi
batch = torch.LongTensor(batch_size)
c_idxs = torch.randperm(len(self.classes))[:cpi]
for i, c in enumerate(self.classes[c_idxs]):
s = slice(i * spc, (i + 1) * spc)
# FIXME when torch.argwhere will exists
label_idx = torch.arange(len(self.classes)).long()[self.classes == c].item()
sample_idxs = torch.randperm(self.numel_per_class[label_idx])[:spc]
batch[s] = self.indexes[label_idx][sample_idxs]
batch = batch[torch.randperm(len(batch))]
yield batch
def __len__(self):
'''
returns the number of iterations (episodes) per epoch
'''
return self.iterations
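# Illustrative run (a sketch; labels synthetic): 2-way episodes with
# 2 samples per class (support + query) drawn from 5 classes.
if __name__ == '__main__':
    labels = [i % 5 for i in range(10)]
    sampler = PrototypicalBatchSampler(labels, classes_per_it=2,
                                       num_samples=2, iterations=3)
    for batch in sampler:
        print(batch)  # 4 dataset indices per episode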
# coding=utf-8
import torch
from torch.nn import functional as F
from torch.nn.modules import Module
class PrototypicalLoss(Module):
'''
Loss class deriving from Module for the prototypical loss function defined below
'''
def __init__(self, n_support):
super(PrototypicalLoss, self).__init__()
self.n_support = n_support
def forward(self, input, target):
return prototypical_loss(input, target, self.n_support)
def euclidean_dist(x, y):
'''
Compute euclidean distance between two tensors
'''
# x: N x D
# y: M x D
n = x.size(0)
m = y.size(0)
d = x.size(1)
    if d != y.size(1):
        raise ValueError('x and y must have the same feature dimension')
x = x.unsqueeze(1).expand(n, m, d)
y = y.unsqueeze(0).expand(n, m, d)
return torch.pow(x - y, 2).sum(2)
def prototypical_loss(input, target, n_support):
'''
Inspired by https://github.com/jakesnell/prototypical-networks/blob/master/protonets/models/few_shot.py
Compute the barycentres by averaging the features of n_support
samples for each class in target, computes then the distances from each
samples' features to each one of the barycentres, computes the
log_probability for each n_query samples for each one of the current
classes, of appartaining to a class c, loss and accuracy are then computed
and returned
Args:
- input: the model output for a batch of samples
- target: ground truth for the above batch of samples
- n_support: number of samples to keep in account when computing
barycentres, for each one of the current classes
'''
target_cpu = target.to('cpu')
input_cpu = input.to('cpu')
def supp_idxs(c):
# FIXME when torch will support where as np
return torch.nonzero(target_cpu.eq(c), as_tuple=False)[:n_support].squeeze(1)
# FIXME when torch.unique will be available on cuda too
classes = torch.unique(target_cpu)
n_classes = len(classes)
# FIXME when torch will support where as np
# assuming n_query, n_target constants
n_query = target_cpu.eq(classes[0].item()).sum().item() - n_support
support_idxs = list(map(supp_idxs, classes))
prototypes = torch.stack([input_cpu[idx_list].mean(0) for idx_list in support_idxs])
# FIXME when torch will support where as np
query_idxs = torch.stack(list(map(lambda c: torch.nonzero(target_cpu.eq(c), as_tuple=False)[n_support:], classes))).view(-1)
query_samples = input.to('cpu')[query_idxs]
dists = euclidean_dist(query_samples, prototypes)
log_p_y = F.log_softmax(-dists, dim=1).view(n_classes, n_query, -1)
target_inds = torch.arange(0, n_classes)
target_inds = target_inds.view(n_classes, 1, 1)
target_inds = target_inds.expand(n_classes, n_query, 1).long()
loss_val = -log_p_y.gather(2, target_inds).squeeze().view(-1).mean()
_, y_hat = log_p_y.max(2)
acc_val = y_hat.eq(target_inds.squeeze()).float().mean()
return loss_val, acc_val
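# Illustrative episode (a sketch; embeddings synthetic): 3 classes with
# 2 support + 3 query samples each, 16-dimensional features.
if __name__ == '__main__':
    torch.manual_seed(0)
    target = torch.arange(3).repeat_interleave(5)  # 5 samples per class
    emb = torch.randn(15, 16)
    loss, acc = prototypical_loss(emb, target, n_support=2)
    print(loss.item(), acc.item())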
import numpy as np
import torch
from numba import jit
from torch.autograd import Function
import gl
@jit(nopython=True)
def compute_softdtw(D, gamma):
B = D.shape[0]
N = D.shape[1]
M = D.shape[2]
R = np.ones((B, N + 2, M + 2)) * np.inf
R[:, 0, 0] = 0
for k in range(B):
for j in range(1, M + 1):
for i in range(1, N + 1):
r0 = -R[k, i - 1, j - 1] / gamma
r1 = -R[k, i - 1, j] / gamma
r2 = -R[k, i, j - 1] / gamma
rmax = max(max(r0, r1), r2)
rsum = np.exp(r0 - rmax) + np.exp(r1 - rmax) + np.exp(r2 - rmax)
softmin = - gamma * (np.log(rsum) + rmax)
R[k, i, j] = D[k, i - 1, j - 1] + softmin
return R
@jit(nopython=True)
def compute_softdtw_backward(D_, R, gamma):
B = D_.shape[0]
N = D_.shape[1]
M = D_.shape[2]
D = np.zeros((B, N + 2, M + 2))
E = np.zeros((B, N + 2, M + 2))
D[:, 1:N + 1, 1:M + 1] = D_
E[:, -1, -1] = 1
R[:, : , -1] = -np.inf
R[:, -1, :] = -np.inf
R[:, -1, -1] = R[:, -2, -2]
for k in range(B):
for j in range(M, 0, -1):
for i in range(N, 0, -1):
a0 = (R[k, i + 1, j] - R[k, i, j] - D[k, i + 1, j]) / gamma
b0 = (R[k, i, j + 1] - R[k, i, j] - D[k, i, j + 1]) / gamma
c0 = (R[k, i + 1, j + 1] - R[k, i, j] - D[k, i + 1, j + 1]) / gamma
a = np.exp(a0)
b = np.exp(b0)
c = np.exp(c0)
E[k, i, j] = E[k, i + 1, j] * a + E[k, i, j + 1] * b + E[k, i + 1, j + 1] * c
return E[:, 1:N + 1, 1:M + 1]
class _SoftDTW(Function):
@staticmethod
def forward(ctx, D, gamma):
dev = D.device
dtype = D.dtype
gamma = torch.Tensor([gamma]).to(dev).type(dtype) # dtype fixed
D_ = D.detach().cpu().numpy()
gl.D_ = D_
g_ = gamma.item()
R = torch.Tensor(compute_softdtw(D_, g_)).to(dev).type(dtype)
gl.R_ = R.detach().cpu().numpy()
ctx.save_for_backward(D, R, gamma)
return R[:, -2, -2]
@staticmethod
def backward(ctx, grad_output):
dev = grad_output.device
dtype = grad_output.dtype
D, R, gamma = ctx.saved_tensors
D_ = D.detach().cpu().numpy()
R_ = R.detach().cpu().numpy()
g_ = gamma.item()
E = torch.Tensor(compute_softdtw_backward(D_, R_, g_)).to(dev).type(dtype)
return grad_output.view(-1, 1, 1).expand_as(E) * E, None
class SoftDTW(torch.nn.Module):
def __init__(self, gamma=1.0, normalize=False, attention=None, attention_y=None):
super(SoftDTW, self).__init__()
self.normalize = normalize
self.gamma = gamma
self.func_dtw = _SoftDTW.apply
self.attention = attention
self.attention_y = attention_y
        if attention is not None:
self.calc_matrix_func = self.attention_calc_distance_matrix
else:
self.calc_matrix_func = self.calc_distance_matrix
    def attention_calc_distance_matrix(self, x, y):
        """Cross-attend x and y frame-wise, then measure cosine distance on the
        attended features."""
        n, t, v, c = x.size()
        x = x.view(n * t, v, c)
        y = y.view(n * t, v, c)
        attention_x = self.attention(x, y)
        attention_y = self.attention_y(y, x)
        attention_x = attention_x.view(n, t, -1)
        attention_y = attention_y.view(n, t, -1)
        return self.calc_distance_matrix(attention_x, attention_y)
    def calc_distance_matrix(self, x, y):
        """Pairwise cosine distance between time steps: x (B, n, d) and
        y (B, m, d) -> (B, n, m), with values in [0, 2]."""
        n = x.size(1)
        m = y.size(1)
        d = x.size(2)
        x = x.unsqueeze(2).expand(-1, n, m, d)
        y = y.unsqueeze(1).expand(-1, n, m, d)
        x = x.reshape(-1, d)
        y = y.reshape(-1, d)
        # L2-normalize so the dot product equals the cosine similarity
        x = x / (x.norm(dim=1, keepdim=True) + 1e-8)
        y = y / (y.norm(dim=1, keepdim=True) + 1e-8)
        e_cos = (x * y).sum(1).view(-1, n, m)
        return 1 - e_cos  # cosine distance
def forward(self, x, y):
assert len(x.shape) == len(y.shape)
squeeze = False
if len(x.shape) < 3:
x = x.unsqueeze(0)
y = y.unsqueeze(0)
squeeze = True
if self.normalize:
D_xx = self.calc_matrix_func(x, x)
out_xx = self.func_dtw(D_xx, self.gamma)
D_yy = self.calc_matrix_func(y, y)
out_yy = self.func_dtw(D_yy, self.gamma)
D_xy = self.calc_matrix_func(x, y)
out_xy = self.func_dtw(D_xy, self.gamma)
result = out_xy - 1/2 * (out_xx + out_yy) # distance
else:
D_xy = self.calc_matrix_func(x, y)
out_xy = self.func_dtw(D_xy, self.gamma)
result = out_xy # discrepancy
        gl.iter += 1
        # (debug) snapshots of gl.R_ / gl.D_ can be dumped here with np.save when needed
return result.squeeze(0) if squeeze else result
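# Hedged end-to-end sketch of the SoftDTW module on random sequences (feature
# dimension and lengths are arbitrary assumptions). With normalize=True the
# output is the discrepancy out_xy - (out_xx + out_yy) / 2.
# Note: SoftDTW.forward bumps gl.iter, which train.py initializes beforehand.
def _demo_softdtw_module():
    sdtw = SoftDTW(gamma=1.0, normalize=True)
    x = torch.rand(2, 10, 32)  # (batch, time, feature)
    y = torch.rand(2, 12, 32)
    print(sdtw(x, y).shape)  # torch.Size([2])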
# coding=utf-8
import random
from prototypical_batch_sampler import PrototypicalBatchSampler
from prototypical_loss import prototypical_loss as loss_fn
from nturgbd_dataset import NTU_RGBD_Dataset
from protonet import ProtoNet
from parser_util import get_parser
from utils import load_data, get_para_num, setup_seed, getAvaliableDevice
from tqdm import tqdm
import numpy as np
import torch
import pickle
import os
import time
import gl
import warnings
from utils import *
def init_seed(opt):
    '''
    Disable cudnn to maximize reproducibility
    '''
    torch.backends.cudnn.enabled = False
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)
    torch.cuda.manual_seed(opt.manual_seed)
def init_dataset(opt, data_list, mode):
# print('not extract frame')
# opt.extract_frame = 0
debug = False
dataset = NTU_RGBD_Dataset(mode=mode, data_list=data_list, debug=debug, extract_frame=opt.extract_frame)
n_classes = len(np.unique(dataset.label))
if n_classes < opt.classes_per_it_tr or n_classes < opt.classes_per_it_val:
        raise Exception('There are not enough classes in the dataset to '
                        'satisfy the chosen classes_per_it. Decrease the '
                        'classes_per_it_{tr/val} option and try again.')
return dataset
def init_sampler(opt, labels, mode):
if 'train' in mode:
classes_per_it = opt.classes_per_it_tr
num_samples = opt.num_support_tr + opt.num_query_tr
iters = opt.train_iterations
else:
classes_per_it = opt.classes_per_it_val
num_samples = opt.num_support_val + opt.num_query_val
iters = opt.test_iterations
return PrototypicalBatchSampler(labels=labels,
classes_per_it=classes_per_it,
num_samples=num_samples,
iterations=iters)
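# Hedged note: each episode drawn by this sampler contains
# classes_per_it * num_samples items; e.g. with the 5-way 1-shot setting and an
# assumed 5 queries per class, that is 5 * (1 + 5) = 30 samples per iteration.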
def init_dataloader(opt, data_list, mode):
dataset = init_dataset(opt, data_list, mode)
sampler = init_sampler(opt, dataset.label, mode)
dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, num_workers=4)
return dataloader
def init_protonet(opt):
'''
Initialize the ProtoNet
'''
model = ProtoNet(opt).to(gl.device)
if opt.model == 1:
model_path = os.path.join(opt.experiment_root, 'best_model.pth')
# print('model_path', model_path)
model.load_state_dict(torch.load(model_path))
# print(get_para_num(model))
return model
def init_optim(opt, model):
'''
Initialize optimizer
'''
# optimizer = torch.optim.SGD(model.parameters(), lr=opt.learning_rate, momentum=0.9, weight_decay=5e-4, nesterov=True)
optimizer = torch.optim.Adam(params=model.parameters(), lr=opt.learning_rate, weight_decay=5e-4)
return optimizer
def init_lr_scheduler(opt, optim):
    '''
    Initialize the learning rate scheduler
    '''
    if opt.lr_flag == 'reduceLR':
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode='min', factor=0.5, patience=10, verbose=True, min_lr=1e-5)
    elif opt.lr_flag == 'stepLR':
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optim, gamma=opt.lr_scheduler_gamma,
                                                       step_size=opt.lr_scheduler_step)
    else:
        raise ValueError('unknown lr_flag: {}'.format(opt.lr_flag))
    return lr_scheduler
def save_list_to_file(path, thelist):
with open(path, 'w') as f:
for item in thelist:
f.write("%s\n" % item)
def cosine(x, mode):
    '''
    Dump frame-to-frame cosine-similarity matrices of the first sample in the
    batch to text files under the experiment root.
    '''
    if mode == 'not_encoder':
        t_path = os.path.join(gl.experiment_root, 'origin_t')
        n, c, t, v, m = x.size()
        x = x.mean(4)
    else:
        t_path = os.path.join(gl.experiment_root, 't')
        n, c, t, v = x.size()
for i in range(t - 1):
if not os.path.exists(t_path):
os.mkdir(t_path)
f_path = os.path.join(t_path, '{}_{}.txt'.format(i, i + 1))
t1, t2 = torch.transpose(x[0, :, i, :], 1, 0), torch.transpose(x[0, :, i + 1, :], 1, 0)
t1 = t1 / (t1.norm(dim=1, keepdim=True) + 1e-8)
t2 = t2 / (t2.norm(dim=1, keepdim=True) + 1e-8)
cos = torch.mm(t1, torch.transpose(t2, 1, 0))
# print(cos)
np.savetxt(f_path, cos.cpu().detach().numpy(), fmt='%.2f')
# print('--------------------')
t1, t2 = torch.transpose(x[0, :, 0, :], 1, 0), torch.transpose(x[0, :, t - 1, :], 1, 0)
t1 = t1 / (t1.norm(dim=1, keepdim=True) + 1e-8)
t2 = t2 / (t2.norm(dim=1, keepdim=True) + 1e-8)
cos = torch.mm(t1, torch.transpose(t2, 1, 0))
# print(cos)
f_path = os.path.join(t_path, '{}_{}.txt'.format(0, t - 1))
np.savetxt(f_path, cos.cpu().detach().numpy(), fmt='%.2f')
def train(opt, tr_dataloader, model, optim, lr_scheduler, val_dataloader=None, test_dataloader=None):
'''
Train the model with the prototypical learning algorithm
'''
import json
with open(os.path.join(opt.experiment_root, 'opt.json'), 'w') as f:
j = vars(opt)
json.dump(j, f)
f.write('\n')
if val_dataloader is None:
best_state = None
best_acc = 0
last_acc = 0
acc_reduce_num = 0
best_model_path = os.path.join(opt.experiment_root, 'best_model.pth')
last_model_path = os.path.join(opt.experiment_root, 'last_model.pth')
trace_file = os.path.join(opt.experiment_root, 'trace.txt')
start_epoch = 0
    patience = 0
for epoch in range(start_epoch, opt.epochs):
gl.epoch = epoch
gl.iter = 0
# print('=== Epoch: {} ==='.format(epoch))
tr_iter = iter(tr_dataloader)
model.train()
lr = opt.learning_rate
train_acc = []
reg_loss = []
train_loss = []
for batch in tqdm(tr_iter):
# for batch in tr_iter:
optim.zero_grad()
gl.mod = 'train'
x, y = batch
x, y = x.to(gl.device).float(), y.to(gl.device)
model_output = model(x)
loss, acc, reg = model.loss(model_output, y, opt.num_support_tr,opt.dtw)
train_loss.append(loss.item())
train_acc.append(acc.item())
reg_loss.append(reg.item())
loss.backward()
optim.step()
avg_loss = np.mean(train_loss)
avg_reg = np.mean(reg_loss)
avg_acc = np.mean(train_acc)
t_loss, t_acc = avg_loss, avg_acc
        string = 'train loss: {}, classifier loss: {}, reg loss: {}, train Acc: {}'.format(avg_loss, avg_loss - avg_reg, avg_reg, avg_acc)
if opt.lr_flag == 'reduceLR':
lr_scheduler.step(avg_loss)
elif opt.lr_flag == 'stepLR':
lr_scheduler.step()
lr = optim.state_dict()['param_groups'][0]['lr']
if val_dataloader is None:
continue
val_iter = iter(val_dataloader)
model.eval()
val_loss = []
val_acc = []
for batch in tqdm(val_iter):
# for batch in val_iter:
x, y = batch
x, y = x.to(gl.device).float(), y.to(gl.device)
gl.mod = 'val'
model_output = model(x)
loss, acc, reg = model.loss(model_output, target=y, n_support=opt.num_support_val,dtw=opt.dtw)
val_loss.append(loss.item())
val_acc.append(acc.item())
avg_loss = np.mean(val_loss)
avg_acc = np.mean(val_acc)
        # early stopping: stop if val accuracy fails to improve 10 times in a row
        if last_acc == 0:
            last_acc = avg_acc
        else:
            if last_acc >= avg_acc:
                acc_reduce_num += 1
            else:
                acc_reduce_num = 0
            last_acc = avg_acc
        if acc_reduce_num >= 10:
            print('validation accuracy has not improved for 10 consecutive evaluations, stopping training...')
            break
v_loss, v_acc = avg_loss, avg_acc
postfix = ' (Best)' if avg_acc >= best_acc else ' (Best: {})'.format(best_acc)
string_val = 'val loss: {}, val acc: {}{} lr:{}'.format(avg_loss, avg_acc, postfix, lr)
print(string + '\t' + string_val)
with open(trace_file, 'a') as f:
f.write(string + '\t' + string_val)
f.write('\n')
        if avg_acc >= best_acc:
            torch.save(model.state_dict(), best_model_path)
            patience = 0
            best_acc = avg_acc
            best_state = model.state_dict()
        else:
            patience += 1
            if patience > 40:
                break
torch.save(model.state_dict(), last_model_path)
return best_state, best_acc
def test(opt, test_dataloader, model):
'''
Test the model trained with the prototypical learning algorithm
'''
print('testing model...')
avg_acc = list()
trace_file = os.path.join(opt.experiment_root, 'test.txt')
n_class_val, n_query_val = opt.classes_per_it_val, opt.num_query_val
for epoch in range(10):
# print('=== Epoch: {} ==='.format(epoch))
model.eval()
gl.epoch = epoch
test_iter = iter(test_dataloader)
for batch in test_iter:
x, y = batch
x, y = x.to(gl.device).float(), y.to(gl.device)
model_output = model(x)
_, acc, _ = model.loss(model_output, target=y, n_support=opt.num_support_val,dtw=opt.dtw)
avg_acc.append(acc.item())
# print('test avg_acc', np.mean(avg_acc))
avg_acc = np.mean(avg_acc)
with open(trace_file, 'a') as f:
f.write('test acc: {}'.format(avg_acc))
f.write('\n')
print('Test Acc: {}'.format(avg_acc))
return avg_acc
def eval(opt):
    '''
    Initialize everything and evaluate the best saved model on the test split
    '''
    options = get_parser().parse_args()
    if torch.cuda.is_available() and not options.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    init_seed(options)
    test_dataloader = init_dataloader(options, [], 'test')
    model = init_protonet(options)
    model_path = os.path.join(opt.experiment_root, 'best_model.pth')
    model.load_state_dict(torch.load(model_path))
    test(opt=options,
         test_dataloader=test_dataloader,
         model=model)
def main():
'''
Initialize everything and train
'''
options = get_parser().parse_args()
    options.experiment_root = os.path.join(options.experiment_root, "seed_" + str(options.manual_seed),
                                           "_dataset" + str(options.dataset), "_back" + str(options.backbone),
                                           "_reg" + str(options.reg_rate) + "_att" + str(options.SA) + "_dtw" + str(options.dtw))
    options.cuda = True
    options.device = str(1)  # NOTE: hardcodes GPU 1 regardless of the command line
if options.debug == 1:
gl.debug = True
device = 'cuda:{}'.format(options.device) if torch.cuda.is_available() and options.cuda else 'cpu'
gl.device = device
# print("device",device)
gl.gamma = options.gamma
options.experiment_root = "../log/"+options.experiment_root
gl.experiment_root=options.experiment_root
gl.reg_rate = options.reg_rate
gl.threshold = options.thred
gl.backbone = options.backbone
gl.dataset = options.dataset
gl.SA = options.SA
if not os.path.exists(gl.experiment_root):
os.makedirs(gl.experiment_root)
if torch.cuda.is_available() and not options.cuda:
print("WARNING: You have a CUDA device, so you should probably run with --cuda")
init_seed(options)
setup_seed(options.manual_seed)
data_list = []
tr_dataloader = init_dataloader(options, data_list, 'train')
val_dataloader = init_dataloader(options, data_list, 'val')
test_dataloader = init_dataloader(options, data_list, 'test')
model = init_protonet(options)
optim = init_optim(options, model)
lr_scheduler = init_lr_scheduler(options, optim)
if options.mode == 'train':
res = train(opt=options,
tr_dataloader=tr_dataloader,
val_dataloader=val_dataloader,
test_dataloader=test_dataloader,
model=model,
optim=optim,
lr_scheduler=lr_scheduler)
best_state, best_acc = res
# print('Testing with last model..')
# test(opt=options,
# test_dataloader=test_dataloader,
# model=model)
        # reload the best checkpoint saved during training
        model_path = os.path.join(options.experiment_root, 'best_model.pth')
        model.load_state_dict(torch.load(model_path))
print('Testing with best model..')
test(opt=options,
test_dataloader=test_dataloader,
model=model)
    elif options.mode == 'test':
        print('Testing with best model..')
        test(opt=options,
             test_dataloader=test_dataloader,
             model=model)
if __name__ == '__main__':
main()
import pickle
import csv
import fcntl
import torch
import time
import os
import numpy as np
import random
import gl
from soft_dtw import SoftDTW
import pynvml,time
def getAvaliableDevice(gpu=[0], min_mem=24000, left=False):
    """
    Block until one of the given GPUs has at least min_mem MiB free and under
    10% utilization, then return its (remapped) index.
    :param gpu: list of GPU indices to poll
    :param min_mem: minimum free memory in MiB
    :param left: if True, leave at least one available GPU for other users
    :return: index of an available GPU
    """
    pynvml.nvmlInit()
    t = int(time.strftime("%H", time.localtime()))
    if t >= 23 or t < 8:
        left = False  # at night, do not leave any GPUs idle
    min_num = 3
    dic = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, -1: -1}  # index remapping, specific to the 207 server
ava_gpu = -1
    while ava_gpu == -1:
        avaliable = []
        for i in gpu:
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            if (meminfo.free / 1024 ** 2) > min_mem and utilization.gpu < 10:
                avaliable.append(dic[i])
        if len(avaliable) == 0 or (left and len(avaliable) <= 1):
            ava_gpu = -1
            time.sleep(20)
            continue
        ava_gpu = avaliable[0]
return ava_gpu
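# Hedged usage sketch (the GPU list and memory floor below are assumptions):
# device_idx = getAvaliableDevice(gpu=[0, 1], min_mem=10000)
# torch.cuda.set_device(device_idx)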
def write_shared_file(file_name, content):
    nowtime = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    content[0] = nowtime + " " + content[0]
    with open(file_name, 'a+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)  # exclusive lock so concurrent runs do not interleave
        f.writelines(content)
        fcntl.flock(f, fcntl.LOCK_UN)
def write_csv_file(file_name, content):
    nowtime = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    content["time"] = nowtime
    to_write_head = not os.path.exists(file_name)
    with open(file_name, 'a+') as f:
        writer = csv.DictWriter(f, content.keys())
        fcntl.flock(f, fcntl.LOCK_EX)
        if to_write_head:
            writer.writeheader()
        writer.writerow(content)
        fcntl.flock(f, fcntl.LOCK_UN)
def get_para_num(net):
total_num = sum(p.numel() for p in net.parameters())
trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad)
return {'Total': total_num, 'Trainable': trainable_num}
def setup_seed(seed=0):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
def euclidean_dist(x, y):
# x: N x D
# y: M x D
n = x.size(0)
m = y.size(0)
d = x.size(1)
assert d == y.size(1)
x = x.unsqueeze(1).expand(n, m, d)
y = y.unsqueeze(0).expand(n, m, d)
return torch.pow(x - y, 2).sum(2)
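# Hedged shape sketch: euclidean_dist broadcasts every query against every
# prototype; the sizes below are illustrative assumptions.
def _demo_euclidean_dist():
    q = torch.rand(15, 64)  # e.g. 5 classes x 3 queries
    p = torch.rand(5, 64)   # 5 class prototypes
    print(euclidean_dist(q, p).shape)  # torch.Size([15, 5])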
def get_support_query_data(support, query, device):
    '''
    Interleave every (query, support) pair for pairwise matching.
    :param support: [n_class, c, v] class prototypes
    :param query: [n_class * n_query, c, v] query samples
    :return: node_features: [n_class * (n_class * n_query) * 2 * v, c]; each query
             sits at an even slot, its candidate support at the following odd slot
    '''
n_class, c, v = support.size()
all_query = query.size(0)
sum_matching_graph = n_class * all_query * 2
node_features = torch.zeros(sum_matching_graph, c, v).to(device)
idx, idx2= torch.arange(0, sum_matching_graph, 2).to(device), torch.arange(1, sum_matching_graph, 2).to(device)
node_features[idx] = query.unsqueeze(1).repeat(1, n_class, 1, 1).reshape(-1, c, v)
node_features[idx2] = support.unsqueeze(0).repeat(all_query, 1, 1, 1).reshape(-1, c, v)
node_features = node_features.permute(0, 2, 1).reshape(sum_matching_graph * v, c)
return node_features
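# Hedged sketch of the interleaving above: queries land at even slots and the
# matching support prototypes at odd slots; sizes are illustrative assumptions.
def _demo_support_query_pairs():
    support = torch.rand(5, 8, 25)  # [n_class, c, v]
    query = torch.rand(15, 8, 25)   # [n_class * n_query, c, v]
    feats = get_support_query_data(support, query, 'cpu')
    print(feats.shape)  # torch.Size([3750, 8]) == (5 * 15 * 2 * 25, 8)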
def euclidean_distance(x, y):
"""This is the squared Euclidean distance."""
return torch.sum((x - y) ** 2, dim=-1)
def compute_similarity(x, y):
"""Compute the distance between x and y vectors.
The distance will be computed based on the training loss type.
Args:
config: a config dict.
x: [n_examples, feature_dim] float tensor.
y: [n_examples, feature_dim] float tensor.
Returns:
dist: [n_examples] float tensor.
Raises:
ValueError: if loss type is not supported.
"""
return -euclidean_distance(x, y)
def extract_k_segement(x, num_frame, segement):
    '''Randomly sample one frame from each of `segement` equal temporal bins of every sequence.'''
    n, c, t, v = x.size()
assert n == len(num_frame)
step = num_frame // segement
new_x = []
for i in range(n):
idx = [ random.randint(j * step[i], (j + 1) * step[i] - 1) for j in range(segement)]
new_x.append(x[i, :, idx, :].unsqueeze(0))
new_x = torch.cat(new_x, dim=0)
return new_x
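# Hedged sketch: one random frame is drawn from each of `segement` equal
# temporal bins; the tensor sizes below are assumptions for illustration.
def _demo_extract_k_segement():
    x = torch.rand(2, 3, 300, 25)     # (n, c, t, v)
    num_frame = np.array([300, 240])  # valid frame counts per sequence
    print(extract_k_segement(x, num_frame, 30).shape)  # torch.Size([2, 3, 30, 25])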
def load_data(path, train_class_name, val_class_name, test_class_name):
data_path = os.path.join(path, 'train_data.npy')
label_path = os.path.join(path, 'train_label.pkl')
# num_frame_path = os.path.join(path, 'train_num_frame.npy')
num_class = np.zeros(125)
try:
with open(label_path) as f:
sample_name, label = pickle.load(f)
except:
# for pickle file from python2
with open(label_path, 'rb') as f:
sample_name, label = pickle.load(f, encoding='latin1')
# load data
data = np.load(data_path)
# num_frame = np.load(num_frame_path)
num_frame = np.ones(len(label)) * 300
train_data, val_data, test_data = [], [], []
train_label, val_label, test_label = [], [], []
train_num_frame, val_num_frame, test_num_frame = [], [], []
for i in range(len(label)):
if label[i] > 120 :
continue
num_class[label[i]] += 1
if label[i] in train_class_name:
if num_class[label[i]] >= 500:
continue
train_data.append(np.expand_dims(data[i], axis=0))
train_label.append(label[i])
train_num_frame.append(num_frame[i])
elif label[i] in val_class_name:
if num_class[label[i]] >= 100:
continue
val_data.append(np.expand_dims(data[i], axis=0))
val_label.append(label[i])
val_num_frame.append(num_frame[i])
elif label[i] in test_class_name:
if num_class[label[i]] >= 100:
continue
test_data.append(np.expand_dims(data[i], axis=0))
test_label.append(label[i])
test_num_frame.append(num_frame[i])
train_data, val_data, test_data = np.concatenate(train_data, 0), np.concatenate(val_data, 0), np.concatenate(test_data, 0)
    save_path = '/mnt/data1/kinetics-skeleton/train_500_val_100'  # NOTE: hardcoded output directory
np.save(os.path.join(save_path, 'train_data.npy'), train_data)
np.save(os.path.join(save_path, 'train_label.npy'), train_label)
np.save(os.path.join(save_path, 'train_frame.npy'), train_num_frame)
np.save(os.path.join(save_path, 'val_data.npy'), val_data)
np.save(os.path.join(save_path, 'val_label.npy'), val_label)
np.save(os.path.join(save_path, 'val_frame.npy'), val_num_frame)
np.save(os.path.join(save_path, 'test_data.npy'), test_data)
np.save(os.path.join(save_path, 'test_label.npy'), test_label)
np.save(os.path.join(save_path, 'test_frame.npy'), test_num_frame)
data_list = [train_data, train_label, np.array(train_num_frame), val_data, val_label, np.array(val_num_frame), test_data, test_label, np.array(test_num_frame)]
return data_list
def import_class(name):
components = name.split('.')
mod = __import__(components[0])
for comp in components[1:]:
mod = getattr(mod, comp)
return mod
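# Hedged example: import_class resolves a dotted path to an attribute.
def _demo_import_class():
    join = import_class('os.path.join')
    assert join is os.path.join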
def count_params(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
if __name__ == "__main__":
a = 0
\ No newline at end of file