From 7a6f5821bd6615eabb6bc91ba9169828eb00243a Mon Sep 17 00:00:00 2001
From: Hoel Bagard <hoel.bagard@gmail.com>
Date: Thu, 21 Jan 2021 16:10:10 +0900
Subject: [PATCH 01/12] Introduced the use_batch_norm variable, removed old
 code

---
 layers.py | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/layers.py b/layers.py
index 10df5f2..0d7ae78 100644
--- a/layers.py
+++ b/layers.py
@@ -7,13 +7,6 @@ import torch.nn.functional as F
 from .utils.logger import DummyLogger
 
 
-class LayerInfo():
-    def __init__(self):
-        self.memory = 0.0
-        self.ops = 0.0
-        self.output = 0.0
-
-
 class Layer(nn.Module):
     # Default layer arguments
     ACTIVATION = F.leaky_relu
@@ -27,14 +20,12 @@ class Layer(nn.Module):
     VERBOSE = 0
     LOGGER = DummyLogger()
 
-    def __init__(self, activation, batch_norm):
+    def __init__(self, activation):
         super().__init__()
         self.name = 'Layer'
-        self.info = LayerInfo()
 
         # Preload default
         self.activation = Layer.ACTIVATION if activation == 0 else activation
-        self.batch_norm = Layer.BATCH_NORM if batch_norm is None else batch_norm
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         output = input_data
@@ -46,14 +37,14 @@ class Layer(nn.Module):
 
 
 class Linear(Layer):
-    def __init__(self, in_channels: int, out_channels: int, activation=0, batch_norm=None, **kwargs):
-        super().__init__(activation, batch_norm)
+    def __init__(self, in_channels: int, out_channels: int, activation=0, use_batch_norm: bool = False, **kwargs):
+        super().__init__(activation)
 
         self.fc = nn.Linear(in_channels, out_channels, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.fc(input_data))
@@ -61,15 +52,15 @@ class Linear(Layer):
 
 class Conv1d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, batch_norm=None, **kwargs):
-        super().__init__(activation, batch_norm)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
+        super().__init__(activation)
 
         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -77,15 +68,15 @@ class Conv1d(Layer):
 
 class Conv2d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, batch_norm=None, **kwargs):
-        super().__init__(activation, batch_norm)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
+        super().__init__(activation)
 
         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=not Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            track_running_stats=not Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -93,15 +84,15 @@ class Conv2d(Layer):
 
 class Conv3d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, batch_norm=None, **kwargs):
-        super().__init__(activation, batch_norm)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
+        super().__init__(activation)
 
         self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -109,8 +100,8 @@ class Conv3d(Layer):
 
 class Deconv2d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, batch_norm=None, **kwargs):
-        super().__init__(activation, batch_norm)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
+        super().__init__(activation)
 
         self.deconv = nn.ConvTranspose2d(
             in_channels, out_channels, kernel_size, stride=stride,
@@ -118,7 +109,7 @@ class Deconv2d(Layer):
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=not Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            track_running_stats=not Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.deconv(input_data))

From 54000b6c3405ebd54358f13fb1e26eb107698730 Mon Sep 17 00:00:00 2001
From: Hoel Bagard <hoel.bagard@gmail.com>
Date: Thu, 21 Jan 2021 20:36:22 +0900
Subject: [PATCH 02/12] Fixed default use_batch_norm value

---
 layers.py | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/layers.py b/layers.py
index 0d7ae78..1f916d0 100644
--- a/layers.py
+++ b/layers.py
@@ -2,49 +2,49 @@ from typing import Union, Tuple
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from .utils.logger import DummyLogger
 
 
 class Layer(nn.Module):
     # Default layer arguments
-    ACTIVATION = F.leaky_relu
+    ACTIVATION = torch.nn.LeakyReLU
+    ACTIVATION_KWARGS = {"negative_slope": 0.1}
 
-    BATCH_NORM = True
+    USE_BATCH_NORM = True
     BATCH_NORM_TRAINING = True
     BATCH_NORM_MOMENTUM = 0.01
 
     IS_TRAINING = False
     METRICS = False
-    VERBOSE = 0
     LOGGER = DummyLogger()
 
-    def __init__(self, activation):
+    def __init__(self, activation, use_batch_norm):
         super().__init__()
         self.name = 'Layer'
 
         # Preload default
         self.activation = Layer.ACTIVATION if activation == 0 else activation
+        self.use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         output = input_data
         if self.activation is not None:
             output = self.activation(output)
-        if self.batch_norm is not None:
+        if self.use_batch_norm is not None:
             output = self.batch_norm(output)
         return output
 
 
 class Linear(Layer):
-    def __init__(self, in_channels: int, out_channels: int, activation=0, use_batch_norm: bool = False, **kwargs):
-        super().__init__(activation)
+    def __init__(self, in_channels: int, out_channels: int, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation, use_batch_norm)
 
         self.fc = nn.Linear(in_channels, out_channels, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.fc(input_data))
@@ -52,15 +52,15 @@ class Linear(Layer):
 
 class Conv1d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
-        super().__init__(activation)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.batch_norm, **kwargs)
+                              bias=not Layer.USE_BATCH_NORM, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -68,15 +68,15 @@ class Conv1d(Layer):
 
 class Conv2d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
-        super().__init__(activation)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.batch_norm, **kwargs)
+                              bias=not Layer.USE_BATCH_NORM, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=not Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
+            track_running_stats=not Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -84,15 +84,15 @@ class Conv2d(Layer):
 
 class Conv3d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
-        super().__init__(activation)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.batch_norm, **kwargs)
+                              bias=not Layer.USE_BATCH_NORM, **kwargs)
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -100,16 +100,16 @@ class Conv3d(Layer):
 
 class Deconv2d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = False, **kwargs):
-        super().__init__(activation)
+                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation, use_batch_norm)
 
         self.deconv = nn.ConvTranspose2d(
             in_channels, out_channels, kernel_size, stride=stride,
-            bias=not self.batch_norm, **kwargs)
+            bias=not Layer.USE_BATCH_NORM, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=not Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
+            track_running_stats=not Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.deconv(input_data))

From a4280a1b78dffcea39ff918dc1731855aacc67e9 Mon Sep 17 00:00:00 2001
From: Hoel Bagard <hoel.bagard@gmail.com>
Date: Fri, 22 Jan 2021 12:38:07 +0900
Subject: [PATCH 03/12] Fixed issues: layers now use self.use_batch_norm
 instead of default value, fixed Layer's forward

---
 layers.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/layers.py b/layers.py
index 1f916d0..3bedb24 100644
--- a/layers.py
+++ b/layers.py
@@ -21,9 +21,8 @@ class Layer(nn.Module):
 
     def __init__(self, activation, use_batch_norm):
         super().__init__()
-        self.name = 'Layer'
-
         # Preload default
+        self.batch_norm: torch.nn._BatchNorm = None
         self.activation = Layer.ACTIVATION if activation == 0 else activation
         self.use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
 
@@ -31,7 +30,8 @@ class Layer(nn.Module):
         output = input_data
         if self.activation is not None:
             output = self.activation(output)
-        if self.use_batch_norm is not None:
+        if self.use_batch_norm:
+            # It is assumed here that if using batch norm, then self.batch_norm has been instantiated.
             output = self.batch_norm(output)
         return output
 
@@ -44,7 +44,7 @@ class Linear(Layer):
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.fc(input_data))
@@ -60,7 +60,7 @@ class Conv1d(Layer):
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -92,7 +92,7 @@ class Conv3d(Layer):
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING if Layer.USE_BATCH_NORM else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))

From ce6314bf5eb824000611deaaeff628150ac404b7 Mon Sep 17 00:00:00 2001
From: Hoel Bagard <hoel.bagard@gmail.com>
Date: Fri, 22 Jan 2021 12:48:33 +0900
Subject: [PATCH 04/12] Fixed bias

---
 layers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/layers.py b/layers.py
index 3bedb24..aaf219f 100644
--- a/layers.py
+++ b/layers.py
@@ -56,7 +56,7 @@ class Conv1d(Layer):
         super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not Layer.USE_BATCH_NORM, **kwargs)
+                              bias=not self.use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -72,7 +72,7 @@ class Conv2d(Layer):
         super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not Layer.USE_BATCH_NORM, **kwargs)
+                              bias=not self.use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -88,7 +88,7 @@ class Conv3d(Layer):
         super().__init__(activation, use_batch_norm)
 
         self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not Layer.USE_BATCH_NORM, **kwargs)
+                              bias=not self.use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -105,7 +105,7 @@ class Deconv2d(Layer):
 
         self.deconv = nn.ConvTranspose2d(
             in_channels, out_channels, kernel_size, stride=stride,
-            bias=not Layer.USE_BATCH_NORM, **kwargs)
+            bias=not self.use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,

From 8d13de5711d3cc2db3b73c1d4381bc196f89c65e Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Thu, 29 Apr 2021 19:45:32 +0900
Subject: [PATCH 05/12] Improve ResNet layers

---
 residual.py | 74 ++++++++++++++++++++++-------------------------------
 1 file changed, 30 insertions(+), 44 deletions(-)

diff --git a/residual.py b/residual.py
index bdc14c8..e35a009 100644
--- a/residual.py
+++ b/residual.py
@@ -3,65 +3,51 @@ from typing import Union, Tuple
 import torch
 import torch.nn as nn
 
-from .layers import LayerInfo, Layer
+from .layers import Conv2d, LayerInfo, Layer
 
 
 class ResBlock(Layer):
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 activation=None, **kwargs):
+    def __init__(self, in_channels: int, out_channels: int = -1, kernel_size: int = 3, padding: int = 1,
+                 stride: Union[int, Tuple[int, int]] = 1, activation=None, batch_norm=None, **kwargs):
         super().__init__(activation if activation is not None else 0, False)
+        self.batch_norm = None
+        if out_channels == -1:
+            out_channels = in_channels
 
         self.seq = nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=False, **kwargs),
-            nn.BatchNorm2d(
-                out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING),
-            torch.nn.LeakyReLU(),
-            nn.Conv2d(out_channels, out_channels, kernel_size=kernel_size, bias=False, padding=1),
-            nn.BatchNorm2d(
-                out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING))
-        self.batch_norm = nn.BatchNorm2d(
-            out_channels,
-            momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=not Layer.BATCH_NORM_TRAINING) if self.batch_norm else None
+            Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=stride, padding=padding, **kwargs),
+            Conv2d(in_channels, out_channels, kernel_size=3, padding=1,
+                   activation=None, batch_norm=batch_norm))
+        self.residual = Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, activation=None) if (
+            out_channels != in_channels or stride != 1) else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        if self.residual is not None:
+            return super().forward(self.residual(input_data) + self.seq(input_data))
         return super().forward(input_data + self.seq(input_data))
 
 
 class ResBottleneck(Layer):
-    def __init__(self, in_channels: int, out_channels: int, planes: int = 1, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=None, **kwargs):
+    def __init__(self, in_channels: int, out_channels: int = -1, bottleneck_channels: int = -1, kernel_size: int = 3,
+                 stride: Union[int, Tuple[int, int]] = 1, padding=1,
+                 activation=None, batch_norm=None, **kwargs):
         super().__init__(activation if activation is not None else 0, False)
         self.batch_norm = None
+        if out_channels == -1:
+            out_channels = in_channels
+        if bottleneck_channels == -1:
+            bottleneck_channels = in_channels // 4
 
         self.seq = nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
-            nn.BatchNorm2d(
-                out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING),
-            torch.nn.LeakyReLU(),
-            nn.Conv2d(out_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=False, **kwargs),
-            nn.BatchNorm2d(
-                out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING),
-            torch.nn.LeakyReLU(),
-            nn.Conv2d(out_channels, planes * out_channels, kernel_size=1, bias=False),
-            nn.BatchNorm2d(
-                out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING))
-        self.downsample = nn.Sequential(
-            nn.Conv2d(in_channels, planes * out_channels, stride=stride, kernel_size=1),
-            nn.BatchNorm2d(
-                planes * out_channels,
-                momentum=Layer.BATCH_NORM_MOMENTUM,
-                track_running_stats=not Layer.BATCH_NORM_TRAINING))
+            Conv2d(in_channels, bottleneck_channels, kernel_size=1),
+            Conv2d(bottleneck_channels, bottleneck_channels, kernel_size=kernel_size,
+                   stride=stride, padding=padding, **kwargs),
+            Conv2d(bottleneck_channels, out_channels, kernel_size=1,
+                   activation=None, batch_norm=batch_norm))
+        self.residual = Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, activation=None) if (
+            out_channels != in_channels or stride != 1) else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
-        return super().forward(self.downsample(input_data) + self.seq(input_data))
+        if self.residual is not None:
+            return super().forward(self.residual(input_data) + self.seq(input_data))
+        return super().forward(input_data + self.seq(input_data))

From 092f4acc3b4d11c60b7a0a7954f6ba0588041a05 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Mon, 17 May 2021 21:05:15 +0900
Subject: [PATCH 06/12] Add SSD

---
 ssd/box.py |  86 ++++++++++++++++++++++++++++
 ssd/ssd.py | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 251 insertions(+)
 create mode 100644 ssd/box.py
 create mode 100644 ssd/ssd.py

diff --git a/ssd/box.py b/ssd/box.py
new file mode 100644
index 0000000..e255fe4
--- /dev/null
+++ b/ssd/box.py
@@ -0,0 +1,86 @@
+import numpy as np
+
+
+def create_box(y_pos: float, x_pos: float, height: float, width: float) -> tuple[float, float, float, float]:
+    """Clamp a (center_y, center_x, height, width) box to the [0, 1] range and return it in the same form."""
+    y0, x0, y1, x1 = check_rectangle(y_pos - height / 2, x_pos - width / 2, y_pos + height / 2, x_pos + width / 2)
+    return (y0 + y1) / 2, (x0 + x1) / 2, y1 - y0, x1 - x0
+
+
+def check_rectangle(y_min: float, x_min: float, y_max: float, x_max: float) -> tuple[float, float, float, float]:
+    """Clamp each corner coordinate of a rectangle to the [0, 1] range.
+
+    Out-of-range values are replaced by the integer bound (0 or 1); a maximum coordinate equal to 1
+    is also replaced by the integer 1. In-range values are returned unchanged.
+    """
+    def clamp_low_corner(value: float) -> float:
+        if value < 0:
+            return 0
+        return 1 if value > 1 else value
+
+    def clamp_high_corner(value: float) -> float:
+        if value < 0:
+            return 0
+        return 1 if value >= 1 else value
+
+    return (clamp_low_corner(y_min), clamp_low_corner(x_min),
+            clamp_high_corner(y_max), clamp_high_corner(x_max))
+
+
+def get_boxes(predictions: np.ndarray, anchors: np.ndarray, class_index: int) -> np.ndarray:
+    """Decode predictions against anchors into clamped (y, x, h, w) boxes plus the chosen class score column."""
+    decoded = np.zeros(anchors.shape)
+    # Center offsets are scaled by the anchor size and added to the anchor center.
+    decoded[:, :2] = (predictions[:, :2] * anchors[:, 2:4]) + anchors[:, :2]
+    # Sizes are the anchor sizes multiplied by the exponential of the prediction.
+    decoded[:, 2:4] = np.exp(predictions[:, 2:4]) * anchors[:, 2:4]
+    clamped = np.asarray([create_box(*row) for row in decoded])
+    class_scores = predictions[:, class_index:class_index + 1]
+    return np.concatenate([clamped, class_scores], axis=1)
+
+
+def fast_nms(boxes: np.ndarray, min_iou: float) -> np.ndarray:
+    """Greedy non-maximum suppression over rows of (center_y, center_x, height, width, score).
+
+    The highest-scoring remaining box is kept and every other remaining box whose IoU with it is
+    strictly greater than ``min_iou`` is dropped; this repeats until no box remains.
+
+    Returns the kept rows of ``boxes`` in decreasing score order (an empty slice for empty input).
+    """
+    if len(boxes) == 0:
+        return boxes[:0]
+
+    # Corner coordinates of every box.
+    y_min = boxes[:, 0] - (boxes[:, 2] / 2)
+    y_max = boxes[:, 0] + (boxes[:, 2] / 2)
+    x_min = boxes[:, 1] - (boxes[:, 3] / 2)
+    x_max = boxes[:, 1] + (boxes[:, 3] / 2)
+
+    areas = (x_max - x_min) * (y_max - y_min)
+    # Ascending sort: the best remaining candidate is always the last index.
+    idxs = np.argsort(boxes[:, 4])
+
+    pick = []
+    while len(idxs) > 0:
+        last = len(idxs) - 1
+        i = idxs[last]
+        pick.append(i)
+        others = idxs[:last]
+
+        # Clamp the intersection extents at zero: for disjoint boxes both differences can be
+        # negative, and their product would otherwise look like a positive overlap.
+        inter_tops = np.maximum(y_min[i], y_min[others])
+        inter_bottoms = np.minimum(y_max[i], y_max[others])
+        inter_lefts = np.maximum(x_min[i], x_min[others])
+        inter_rights = np.minimum(x_max[i], x_max[others])
+        inter_areas = np.maximum(inter_rights - inter_lefts, 0) * np.maximum(inter_bottoms - inter_tops, 0)
+
+        # Intersection over union with the picked box.
+        union_area = (areas[others] + areas[i]) - inter_areas
+        overlap = inter_areas / union_area
+
+        # Drop the picked index and every index overlapping it by more than min_iou.
+        idxs = np.delete(
+            idxs, np.concatenate(([last], np.where(overlap > min_iou)[0])))
+
+    return boxes[pick]
diff --git a/ssd/ssd.py b/ssd/ssd.py
new file mode 100644
index 0000000..eba064d
--- /dev/null
+++ b/ssd/ssd.py
@@ -0,0 +1,165 @@
+import colorsys
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from .box import check_rectangle
+from ..layers import Conv2d
+
+
+class SSD(nn.Module):
+    """SSD detection model: a base network, extra conv layers and one box detector applied per feature map."""
+
+    class Detector(nn.Module):
+        """3x3 convolution without batch norm or activation; the output is permuted with (0, 2, 3, 1)."""
+        def __init__(self, input_features: int, output_features: int):
+            super().__init__()
+            self.conv = Conv2d(input_features, output_features, kernel_size=3, padding=1,
+                               batch_norm=False, activation=None)
+            self.output = None
+
+        def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+            self.output = self.conv(input_data).permute(0, 2, 3, 1)
+            return self.output
+
+    class DetectorMerge(nn.Module):
+        """Softmax the values after the first ``location_dim`` entries of the last axis; keep the locations as is."""
+        def __init__(self, location_dimmension: int):
+            super().__init__()
+            self.location_dim = location_dimmension
+
+        def forward(self, detector_outputs: torch.Tensor) -> torch.Tensor:
+            return torch.cat(
+                [detector_outputs[:, :, :self.location_dim],
+                 torch.softmax(detector_outputs[:, :, self.location_dim:], dim=2)], dim=2)
+
+    class AnchorInfo:
+        """Description of one anchor: (y, x) center, (height, width) size and its corner box clamped to [0, 1]."""
+        def __init__(self, center: tuple[float, float], size: tuple[float],
+                     index: int, layer_index: int, map_index: tuple[int, int], color_index: int,
+                     ratio: float, size_factor: float):
+            self.index = index
+            self.layer_index = layer_index
+            self.map_index = map_index
+            self.color_index = color_index
+            self.ratio = ratio
+            self.size_factor = size_factor
+            self.center = center
+            self.size = size
+            self.box = check_rectangle(
+                center[0] - (size[0] / 2), center[1] - (size[1] / 2),
+                center[0] + (size[0] / 2), center[1] + (size[1] / 2))
+
+        def __repr__(self):
+            return (f'{self.__class__.__name__}'
+                    f'(index:{self.index}, layer:{self.layer_index}, coord:{self.map_index}'
+                    f', center:({self.center[0]:.03f}, {self.center[1]:.03f})'
+                    f', size:({self.size[0]:.03f}, {self.size[1]:.03f})'
+                    f', ratio:{self.ratio:.03f}, size_factor:{self.size_factor:.03f})'
+                    f', y:[{self.box[0]:.03f}:{self.box[2]:.03f}]'
+                    f', x:[{self.box[1]:.03f}:{self.box[3]:.03f}])')
+
+        def __array__(self):
+            return np.array([*self.center, *self.size])
+
+    def __init__(self, base_network: nn.Module, input_sample: torch.Tensor, classes: list[str],
+                 location_dimmension: int, layer_channels: list[int], layer_box_ratios: list[float], layer_args: dict,
+                 box_size_factors: list[float]):
+        """Build the extra conv layers, one detector per layer, and the anchors.
+
+        ``input_sample`` is passed through ``base_network`` once to read its output shape. ``layer_args`` is zipped
+        with ``layer_channels`` as per-layer Conv2d kwargs (a ``'disable'`` key skips that conv) and
+        ``layer_box_ratios`` is indexed per layer. A ``'none'`` class is prepended to ``classes``."""
+        super().__init__()
+
+        self.location_dim = location_dimmension
+        self.classes = ['none'] + classes
+        self.class_count = len(self.classes)
+        self.base_input_shape = input_sample.numpy().shape[1:]
+        self.base_network = base_network
+        sample_output = base_network(input_sample)
+        self.base_output_shape = list(sample_output.detach().numpy().shape)[-3:]
+
+        layer_convs: list[nn.Module] = []
+        layer_detectors: list[SSD.Detector] = []
+        last_feature_count = self.base_output_shape[0]
+        for layer_index, (output_features, kwargs) in enumerate(zip(layer_channels, layer_args)):
+            if 'disable' not in kwargs:
+                layer_convs.append(Conv2d(last_feature_count, output_features, **kwargs))
+            layer_detectors.append(SSD.Detector(
+                last_feature_count, (self.class_count + self.location_dim) * len(layer_box_ratios[layer_index])))
+            last_feature_count = output_features
+        self.layer_convs = nn.ModuleList(layer_convs)
+        self.layer_detectors = nn.ModuleList(layer_detectors)
+
+        self.merge = self.DetectorMerge(location_dimmension)
+
+        self.anchors_numpy, self.anchor_info, self.box_colors = self._create_anchors(
+            sample_output, self.layer_convs, self.layer_detectors, layer_box_ratios, box_size_factors,
+            input_sample.shape[3] / input_sample.shape[2])
+        self.anchors = torch.from_numpy(self.anchors_numpy)
+
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        """Return (batch, boxes, location_dim + class_count): raw locations followed by softmaxed class scores."""
+        head = self.base_network(input_data)
+        detector_outputs = []
+        for layer_index, detector in enumerate(self.layer_detectors):
+            detector_out = detector(head)
+            detector_outputs.append(detector_out.reshape(
+                detector_out.size(0), -1, self.class_count + self.location_dim))
+            if layer_index < len(self.layer_convs):
+                head = self.layer_convs[layer_index](head)
+        detector_outputs = torch.cat(detector_outputs, 1)
+        return self.merge(detector_outputs)
+
+    def _apply(self, fn):
+        # self.anchors is a plain attribute (not a parameter or buffer), so apply fn to it explicitly.
+        super()._apply(fn)
+        self.anchors = fn(self.anchors)
+        return self
+
+    @staticmethod
+    def _create_anchors(
+            base_output: torch.Tensor, layers: nn.ModuleList, detectors: nn.ModuleList, layer_box_ratios: list[float],
+            box_size_factors: list[float], image_ratio: float) -> tuple[np.ndarray, np.ndarray, list[np.ndarray]]:
+        """Return the float32 anchors as rows of [center_y, center_x, height, width], the matching
+        AnchorInfo list and one uint8 RGB color per anchor (one anchor per cell and (ratio, size_factor) pair)."""
+        anchors = []
+        anchor_info: list[SSD.AnchorInfo] = []
+        box_colors: list[np.ndarray] = []
+        head = base_output
+
+        for layer_index, detector in enumerate(detectors):
+            detector_output = detector(head)  # permuted by Detector.forward: axis 1 is rows, axis 2 is columns
+            if layer_index < len(layers):
+                head = layers[layer_index](head)
+
+            detector_rows = detector_output.size()[1]
+            detector_cols = detector_output.size()[2]
+            color_index = 0
+            layer_ratios = layer_box_ratios[layer_index]
+            for index_y in range(detector_rows):
+                center_y = (index_y + 0.5) / detector_rows
+                for index_x in range(detector_cols):
+                    center_x = (index_x + 0.5) / detector_cols
+                    for ratio, size_factor in zip(layer_ratios, box_size_factors):
+                        box_colors.append((np.asarray(colorsys.hsv_to_rgb(
+                            color_index / len(layer_ratios), 1.0, 1.0)) * 255).astype(np.uint8))
+                        color_index += 1
+                        unit_box_size = size_factor / max(detector_rows, detector_cols)
+                        anchor_width = unit_box_size * math.sqrt(ratio / image_ratio)
+                        anchor_height = unit_box_size / math.sqrt(ratio / image_ratio)
+                        anchor_info.append(SSD.AnchorInfo(
+                            (center_y, center_x),
+                            (anchor_height, anchor_width),
+                            len(anchors),
+                            layer_index,
+                            (index_y, index_x),
+                            len(box_colors) - 1,
+                            ratio,
+                            size_factor
+                        ))
+                        anchors.append([center_y, center_x, anchor_height, anchor_width])
+        return np.asarray(anchors, dtype=np.float32), anchor_info, box_colors

From d87bb89e6ca4ed71a8f3104f83ab1c043c0aaa3c Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Fri, 21 May 2021 15:14:14 +0900
Subject: [PATCH 07/12] SSDLoss implementation

---
 layers.py   |   2 +-
 ssd/loss.py | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 ssd/loss.py

diff --git a/layers.py b/layers.py
index 6d511f6..2c9fd9c 100644
--- a/layers.py
+++ b/layers.py
@@ -54,7 +54,7 @@ class Linear(Layer):
     def __init__(self, in_channels: int, out_channels: int, activation=0, batch_norm=None, **kwargs):
         super().__init__(activation, batch_norm)
 
-        self.fc = nn.Linear(in_channels, out_channels, **kwargs)
+        self.fc = nn.Linear(in_channels, out_channels, bias=not self.batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
diff --git a/ssd/loss.py b/ssd/loss.py
new file mode 100644
index 0000000..1b3d259
--- /dev/null
+++ b/ssd/loss.py
@@ -0,0 +1,112 @@
+import torch
+import torch.nn as nn
+
+
+class JacardOverlap(nn.Module):
+    def forward(self, anchors: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+        """
+        Assuming rank 2 (number of boxes, locations), location is (y, x, h, w)
+        Jaccard overlap : A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
+        Return:
+            jaccard overlap: (tensor) Shape: [anchors.size(0), labels.size(0)]
+        """
+        anchors_count = anchors.size(0)
+        labels_count = labels.size(0)
+
+        # Getting coords (y_min, x_min, y_max, x_max) repeated to fill (anchor count, label count)
+        anchor_coords = torch.cat([
+            anchors[:, :2] - (anchors[:, 2:] / 2),
+            anchors[:, :2] + (anchors[:, 2:] / 2)], 1).unsqueeze(1).expand(anchors_count, labels_count, 4)
+        label_coords = torch.cat([
+            labels[:, :2] - (labels[:, 2:] / 2),
+            labels[:, :2] + (labels[:, 2:] / 2)], 1).unsqueeze(0).expand(anchors_count, labels_count, 4)
+
+        mins = torch.max(anchor_coords, label_coords)[:, :, :2]
+        maxes = torch.min(anchor_coords, label_coords)[:, :, 2:]
+
+        inter_coords = torch.clamp(maxes - mins, min=0)
+        inter_area = inter_coords[:, :, 0] * inter_coords[:, :, 1]
+
+        anchor_areas = (anchors[:, 2] * anchors[:, 3]).unsqueeze(1).expand_as(inter_area)
+        label_areas = (labels[:, 2] * labels[:, 3]).unsqueeze(0).expand_as(inter_area)
+
+        union_area = anchor_areas + label_areas - inter_area
+        return inter_area / union_area
+
+
+class SSDLoss(nn.Module):
+    def __init__(self, anchors: torch.Tensor, label_per_image: int,
+                 negative_mining_ratio: int, matching_iou: float,
+                 location_dimmension: int = 4, localization_loss_weight: float = 1.0):
+        super().__init__()
+        self.anchors = anchors
+        self.anchor_count = anchors.size(0)
+        self.label_per_image = label_per_image
+        self.location_dimmension = location_dimmension
+        self.negative_mining_ratio = negative_mining_ratio
+        self.matching_iou = matching_iou
+        self.localization_loss_weight = localization_loss_weight
+
+        self.overlap = JacardOverlap()
+        self.matches = []
+        # self.negative_matches = []
+        self.positive_class_loss = torch.Tensor()
+        self.negative_class_loss = torch.Tensor()
+        self.localization_loss = torch.Tensor()
+        self.class_loss = torch.Tensor()
+        self.final_loss = torch.Tensor()
+
+    def forward(self, input_data: torch.Tensor, input_labels: torch.Tensor) -> torch.Tensor:
+        batch_size = input_data.size(0)
+        expanded_anchors = self.anchors[:, :4].unsqueeze(0).unsqueeze(2).expand(
+            batch_size, self.anchor_count, self.label_per_image, 4)
+        expanded_labels = input_labels[:, :, :self.location_dimmension].unsqueeze(1).expand(
+            batch_size, self.anchor_count, self.label_per_image, self.location_dimmension)
+        objective_pos = (expanded_labels[:, :, :, :2] - expanded_anchors[:, :, :, :2]) / (
+            expanded_anchors[:, :, :, 2:])
+        objective_size = torch.log(expanded_labels[:, :, :, 2:] / expanded_anchors[:, :, :, 2:])
+
+        positive_objectives = []
+        positive_predictions = []
+        positive_class_loss = []
+        negative_class_loss = []
+        self.matches = []
+        # self.negative_matches = []
+        for batch_index in range(batch_size):
+            predictions = input_data[batch_index]
+            labels = input_labels[batch_index]
+            overlaps = self.overlap(self.anchors[:, :4], labels[:, :4])
+            mask = (overlaps >= self.matching_iou).long()
+            match_indices = torch.nonzero(mask, as_tuple=False)
+            self.matches.append(match_indices.detach().cpu())
+
+            mining_count = int(self.negative_mining_ratio * len(self.matches[-1]))
+            masked_prediction = predictions[:, self.location_dimmension] + torch.max(mask, dim=1)[0]
+            non_match_indices = torch.argsort(masked_prediction, dim=-1, descending=False)[:mining_count]
+            # self.negative_matches.append(non_match_indices.detach().cpu())
+
+            for anchor_index, label_index in match_indices:
+                positive_predictions.append(predictions[anchor_index])
+                positive_objectives.append(
+                    torch.cat((
+                        objective_pos[batch_index, anchor_index, label_index],
+                        objective_size[batch_index, anchor_index, label_index]), dim=-1))
+                positive_class_loss.append(torch.log(
+                    predictions[anchor_index, self.location_dimmension + labels[label_index, -1].long()]))
+
+            for anchor_index in non_match_indices:
+                negative_class_loss.append(
+                    torch.log(predictions[anchor_index, self.location_dimmension]))
+
+        if not positive_predictions:
+            return None
+        positive_predictions = torch.stack(positive_predictions)
+        positive_objectives = torch.stack(positive_objectives)
+        self.positive_class_loss = -torch.sum(torch.stack(positive_class_loss))
+        self.negative_class_loss = -torch.sum(torch.stack(negative_class_loss))
+        self.localization_loss = nn.functional.smooth_l1_loss(
+            positive_predictions[:, self.location_dimmension],
+            positive_objectives)
+        self.class_loss = self.positive_class_loss + self.negative_class_loss
+        self.final_loss = (self.localization_loss_weight * self.localization_loss) + self.class_loss
+        return self.final_loss

From 770a9a4f8206d8553006936ceed770d22a155036 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Fri, 21 May 2021 16:00:16 +0900
Subject: [PATCH 08/12] Avoid use_batch_norm as layers instance variable

---
 layers.py                         | 31 +++++++++++++-----------
 transformer/vision_transformer.py | 40 +++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 14 deletions(-)
 create mode 100644 transformer/vision_transformer.py

diff --git a/layers.py b/layers.py
index 93e7749..3966c43 100644
--- a/layers.py
+++ b/layers.py
@@ -19,7 +19,7 @@ class Layer(nn.Module):
     METRICS = False
     LOGGER = DummyLogger()
 
-    def __init__(self, activation, use_batch_norm):
+    def __init__(self, activation):
         super().__init__()
         # Preload default
         if activation == 0:
@@ -28,28 +28,27 @@ class Layer(nn.Module):
             self.activation = activation()
         else:
             self.activation = activation
-        self.batch_norm: torch.nn._BatchNorm = None
-        self.use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+        self.batch_norm: torch.nn._BatchNorm
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         output = input_data
-        if self.activation is not None:
+        if self.activation:
             output = self.activation(output)
-        if self.use_batch_norm:
-            # It is assumed here that if using batch norm, then self.batch_norm has been instanciated.
+        if self.batch_norm:
             output = self.batch_norm(output)
         return output
 
 
 class Linear(Layer):
     def __init__(self, in_channels: int, out_channels: int, activation=0, use_batch_norm: bool = None, **kwargs):
-        super().__init__(activation, use_batch_norm)
+        super().__init__(activation)
 
         self.fc = nn.Linear(in_channels, out_channels, bias=not self.batch_norm, **kwargs)
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.fc(input_data))
@@ -58,14 +57,15 @@ class Linear(Layer):
 class Conv1d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
-        super().__init__(activation, use_batch_norm)
+        super().__init__(activation)
 
         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.use_batch_norm, **kwargs)
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -78,6 +78,7 @@ class Conv2d(Layer):
 
         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.use_batch_norm, **kwargs)
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -90,14 +91,15 @@ class Conv2d(Layer):
 class Conv3d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
-        super().__init__(activation, use_batch_norm)
+        super().__init__(activation)
 
         self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
                               bias=not self.use_batch_norm, **kwargs)
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
@@ -106,15 +108,16 @@ class Conv3d(Layer):
 class Deconv2d(Layer):
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
-        super().__init__(activation, use_batch_norm)
+        super().__init__(activation)
 
         self.deconv = nn.ConvTranspose2d(
             in_channels, out_channels, kernel_size, stride=stride,
             bias=not self.use_batch_norm, **kwargs)
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.deconv(input_data))
diff --git a/transformer/vision_transformer.py b/transformer/vision_transformer.py
new file mode 100644
index 0000000..2e5ef4b
--- /dev/null
+++ b/transformer/vision_transformer.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+
+class Attention(nn.Module):
+    def __init__(self, dim: int, head_count: int = None, qkv_bias: bool = False, qk_scale: float = None,
+                 attention_drop: float = None, projection_drop: float = None):
+        super().__init__()
+        self.head_count = head_count
+        head_dim = dim // head_count
+        self.scale = qk_scale or head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attention_drop = nn.Dropout(
+            attention_drop if attention_drop is not None else VisionTransformer.ATTENTION_DROP)
+        self.projector = nn.Linear(dim, dim)
+        self.projection_drop = nn.Dropout(
+            projection_drop if projection_drop is not None else VisionTransformer.PROJECTION_DROP)
+
+    def foward(self, input_data: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, channel_count = input_data.shape
+        qkv = self.qkv(input_data).reshape(
+            batch_size, sequence_length, 3, self.head_count, channel_count // self.head_count).permute(
+                2, 0, 3, 1, 4)
+        # (output shape : 3, batch_size, head_ctoun, sequence_lenght, channel_count / head_count)
+        query, key, value = qkv[0], qkv[1], qkv[2]
+        attention = self.attention_drop(((query @ key.transpose(-2, -1)) * self.scale).softmax(dim=-1))
+        return self.projection_drop(self.projector(
+            (attention @ value).transpose(1, 2).reshape(batch_size, sequence_length, channel_count)))
+
+
+class VisionTransformer(nn.Module):
+    HEAD_COUNT = 8
+    MLP_RATIO = 4.0
+    QKV_BIAS = False
+    ATTENTION_DROP = 0.0
+    PROJECTION_DROP = 0.0
+
+    def __init__(self, dim: int, head_count: int, mlp_ratio: float = None,
+                 qkv_bias: bool = None

From 06db437aa40655b517e55c6732d38ad255002134 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Sat, 22 May 2021 01:18:39 +0900
Subject: [PATCH 09/12] Vision Transformer

---
 layers.py                         |  60 ++++++++---
 transformer/vision_transformer.py | 170 +++++++++++++++++++++++++++---
 2 files changed, 200 insertions(+), 30 deletions(-)

diff --git a/layers.py b/layers.py
index 3966c43..d27697e 100644
--- a/layers.py
+++ b/layers.py
@@ -38,20 +38,35 @@ class Layer(nn.Module):
             output = self.batch_norm(output)
         return output
 
+    @staticmethod
+    def add_weight_decay(module: nn.Module, weight_decay: float, exclude=()):
+        decay = []
+        no_decay = []
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue
+            if len(param.shape) == 1 or name.endswith('.bias') or name in exclude:
+                no_decay.append(param)
+            else:
+                decay.append(param)
+        return [
+            {'params': no_decay, 'weight_decay': 0.0},
+            {'params': decay, 'weight_decay': weight_decay}]
+
 
 class Linear(Layer):
     def __init__(self, in_channels: int, out_channels: int, activation=0, use_batch_norm: bool = None, **kwargs):
         super().__init__(activation)
 
-        self.fc = nn.Linear(in_channels, out_channels, bias=not self.batch_norm, **kwargs)
         use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+        self.linear = nn.Linear(in_channels, out_channels, bias=not use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
             track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
-        return super().forward(self.fc(input_data))
+        return super().forward(self.linear(input_data))
 
 
 class Conv1d(Layer):
@@ -59,9 +74,9 @@ class Conv1d(Layer):
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
         super().__init__(activation)
 
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.use_batch_norm, **kwargs)
         use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride,
+                              bias=use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm1d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -72,30 +87,30 @@ class Conv1d(Layer):
 
 
 class Conv2d(Layer):
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
-                 stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
-        super().__init__(activation, use_batch_norm)
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, tuple[int, int]] = 3,
+                 stride: Union[int, tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
+        super().__init__(activation)
 
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.use_batch_norm, **kwargs)
         use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
+                              bias=not use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
-            track_running_stats=Layer.BATCH_NORM_TRAINING) if self.use_batch_norm else None
+            track_running_stats=Layer.BATCH_NORM_TRAINING) if use_batch_norm else None
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.conv(input_data))
 
 
 class Conv3d(Layer):
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3,
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, tuple[int, int, int]] = 3,
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
         super().__init__(activation)
 
-        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
-                              bias=not self.use_batch_norm, **kwargs)
         use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride,
+                              bias=use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm3d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -110,10 +125,10 @@ class Deconv2d(Layer):
                  stride: Union[int, Tuple[int, int]] = 1, activation=0, use_batch_norm: bool = None, **kwargs):
         super().__init__(activation)
 
+        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
         self.deconv = nn.ConvTranspose2d(
             in_channels, out_channels, kernel_size, stride=stride,
-            bias=not self.use_batch_norm, **kwargs)
-        use_batch_norm = Layer.USE_BATCH_NORM if use_batch_norm is None else use_batch_norm
+            bias=not use_batch_norm, **kwargs)
         self.batch_norm = nn.BatchNorm2d(
             out_channels,
             momentum=Layer.BATCH_NORM_MOMENTUM,
@@ -121,3 +136,18 @@ class Deconv2d(Layer):
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         return super().forward(self.deconv(input_data))
+
+
+class DropPath(nn.Module):
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        if self.drop_prob == 0.0:
+            return input_data
+        keep_prob = 1 - self.drop_prob
+        shape = (input_data.shape[0],) + (1,) * (input_data.ndim - 1)
+        random_tensor = keep_prob + torch.rand(shape, dtype=input_data.dtype, device=input_data.device)
+        random_tensor.floor_()  # binarize
+        return input_data.div(keep_prob) * random_tensor
diff --git a/transformer/vision_transformer.py b/transformer/vision_transformer.py
index 2e5ef4b..f066417 100644
--- a/transformer/vision_transformer.py
+++ b/transformer/vision_transformer.py
@@ -1,23 +1,42 @@
+from functools import partial
+import math
+
+import numpy as np
 import torch
 import torch.nn as nn
 
+from ..layers import DropPath, Layer
+
+
+class PatchEmbed(nn.Module):
+    def __init__(self, image_shape: tuple[int, int], patch_size: int = 16,
+                 in_channels: int = 3, embed_dim: int = 768):
+        super().__init__()
+        patch_count = (image_shape[0] // patch_size) * (image_shape[1] // patch_size)
+        self.image_shape = image_shape
+        self.patch_size = patch_size
+        self.patch_count = patch_count
+
+        self.projector = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        return self.projector(input_data).flatten(2).transpose(1, 2)
+
 
 class Attention(nn.Module):
-    def __init__(self, dim: int, head_count: int = None, qkv_bias: bool = False, qk_scale: float = None,
-                 attention_drop: float = None, projection_drop: float = None):
+    def __init__(self, dim: int, head_count: int, qkv_bias: bool, qk_scale: float,
+                 attention_drop_rate: float, projection_drop_rate: float):
         super().__init__()
         self.head_count = head_count
         head_dim = dim // head_count
         self.scale = qk_scale or head_dim ** -0.5
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attention_drop = nn.Dropout(
-            attention_drop if attention_drop is not None else VisionTransformer.ATTENTION_DROP)
+        self.attention_drop = nn.Dropout(attention_drop_rate) if attention_drop_rate > 0.0 else nn.Identity()
         self.projector = nn.Linear(dim, dim)
-        self.projection_drop = nn.Dropout(
-            projection_drop if projection_drop is not None else VisionTransformer.PROJECTION_DROP)
+        self.projection_drop = nn.Dropout(projection_drop_rate) if projection_drop_rate > 0.0 else nn.Identity()
 
-    def foward(self, input_data: torch.Tensor) -> torch.Tensor:
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, channel_count = input_data.shape
         qkv = self.qkv(input_data).reshape(
             batch_size, sequence_length, 3, self.head_count, channel_count // self.head_count).permute(
@@ -29,12 +48,133 @@ class Attention(nn.Module):
             (attention @ value).transpose(1, 2).reshape(batch_size, sequence_length, channel_count)))
 
 
-class VisionTransformer(nn.Module):
-    HEAD_COUNT = 8
-    MLP_RATIO = 4.0
-    QKV_BIAS = False
-    ATTENTION_DROP = 0.0
-    PROJECTION_DROP = 0.0
+class Block(nn.Module):
+    def __init__(self, dim: int, head_count: int, mlp_ratio: float,
+                 qkv_bias: bool, qk_scale: float, drop_rate: float,
+                 attention_drop_rate: float, drop_path_rate: float,
+                 norm_layer=0, activation=0):
+        super().__init__()
 
-    def __init__(self, dim: int, head_count: int, mlp_ratio: float = None,
-                 qkv_bias: bool = None
+        self.norm1 = norm_layer(dim)
+        self.attention = Attention(dim, head_count, qkv_bias, qk_scale, attention_drop_rate, drop_rate)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, int(dim * mlp_ratio)),
+            activation(),
+            nn.Linear(int(dim * mlp_ratio), dim),
+            nn.Dropout(drop_rate))
+
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        out = input_data + self.drop_path(self.attention(self.norm1(input_data)))
+        return out + self.drop_path(self.mlp(self.norm2(out)))
+
+
+class VissionTransformer(nn.Module):
+    QK_SCALE = None
+    ACTIVATION = 0
+    NORM_LAYER = nn.LayerNorm
+
+    def __init__(self, image_shape: tuple[int, int, int], class_count: int, depth: int,
+                 path_size: int = 16, embed_dim: int = 768,
+                 head_count: int = 8, mlp_ratio: float = 4.0, qkv_bias: bool = True, qk_scale: float = None,
+                 representation_size=None, distilled: bool = False, drop_rate: float = 0.0,
+                 attention_drop_rate: float = 0.0, drop_path_rate: float = 0.0, embed_layer=PatchEmbed,
+                 norm_layer=0, activation=0):
+        super().__init__()
+        qk_scale = qk_scale if qk_scale is not None else self.QK_SCALE
+        activation = activation if activation != 0 else self.ACTIVATION
+        activation = activation if activation != 0 else Layer.ACTIVATION
+        norm_layer = norm_layer if norm_layer != 0 else self.NORM_LAYER
+
+        self.class_count = class_count
+        self.feature_count = self.embed_dim = embed_dim
+        self.distilled = distilled
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+        self.patch_embed = embed_layer(image_shape[1:], patch_size=path_size,
+                                       in_channels=image_shape[0], embed_dim=embed_dim)
+        patch_count = self.patch_embed.patch_count
+        token_count = 2 if distilled else 1
+
+        self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.distillation_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
+        self.position_embedings = nn.Parameter(torch.zeros(1, patch_count + token_count, embed_dim))
+        self.position_drop = nn.Dropout(drop_rate) if drop_rate > 0.0 else nn.Identity()
+
+        depth_path_drop_rates = np.linspace(0, drop_path_rate, depth) if drop_path_rate > 0.0 else [0.0] * depth
+        self.blocks = nn.Sequential(*[
+            Block(embed_dim, head_count, mlp_ratio, qkv_bias, qk_scale, drop_rate, attention_drop_rate,
+                  pdr, norm_layer, activation) for pdr in depth_path_drop_rates])
+        self.norm = norm_layer(embed_dim)
+
+        # Representation Layer
+        if representation_size and not distilled:
+            self.feature_count = representation_size
+            self.pre_logits = nn.Sequential(
+                nn.Linear(embed_dim, representation_size),
+                nn.Tanh())
+        else:
+            self.pre_logits = nn.Identity()
+
+        # Final classifier
+        self.head = nn.Linear(self.feature_count, class_count) if class_count > 0 else nn.Identity()
+        self.head_distilled = nn.Linear(
+            self.embed_dim, self.class_count) if class_count > 0 and distilled else nn.Identity()
+
+        # Init weights
+        nn.init.trunc_normal_(self.class_token, std=0.02)
+        nn.init.trunc_normal_(self.position_embedings, std=0.02)
+        if self.distilled:
+            nn.init.trunc_normal_(self.distillation_token, std=0.02)
+
+        self.apply(partial(self._init_weights, head_bias=-math.log(self.class_count)))
+
+
+    @torch.jit.ignore
+    def no_weight_decay(self) -> dict:
+        return {'class_token', 'distillation_token', 'position_embedings'}
+
+    def get_classifier(self):
+        return self.head if self.distillation_token is None else (self.head, self.head_distilled)
+
+    def reset_classifier(self, class_count: int):
+        self.class_count = class_count
+        self.head = nn.Linear(self.feature_count, class_count) if class_count > 0 else nn.Identity()
+        self.head_distilled = nn.Linear(
+            self.embed_dim, self.class_count) if class_count > 0 and self.distilled else nn.Identity()
+
+    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
+        embedings = self.patch_embed(input_data)
+        class_token = self.class_token.expand(embedings.shape[0], -1, -1)
+
+        if self.distilled:
+            block_output = self.norm(self.blocks(self.position_drop(
+                torch.cat((class_token, self.distillation_token.expand(embedings.shape[0], -1, -1), embedings), dim=1)
+                + self.position_embedings)))
+            distilled_head_output = self.head_distilled(block_output[:, 1])
+            head_output = self.head(block_output[:, 0])
+            if self.training and not torch.jit.is_scripting():
+                return head_output, distilled_head_output
+            return (head_output + distilled_head_output) / 2.0
+
+        block_output = self.norm(self.blocks(self.position_drop(
+            torch.cat((class_token, embedings), dim=1) + self.position_embedings)))
+        return self.head(self.pre_logits(block_output[:, 0]))
+
+    @staticmethod
+    def _init_weights(module: nn.Module, name: str = '', head_bias: float = 0.0):
+        if isinstance(module, nn.Linear):
+            if name.startswith('head'):
+                nn.init.zeros_(module.weight)
+                nn.init.constant_(module.bias, head_bias)
+            elif name.startswith('pre_logits'):
+                nn.init.xavier_normal_(module.weight)
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Conv2d):
+            nn.init.xavier_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.LayerNorm):
+            nn.init.ones_(module.weight)
+            nn.init.zeros_(module.bias)

From 0cf142571beee7b4207ae733a12580263ea4fabb Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Tue, 25 May 2021 14:06:22 +0900
Subject: [PATCH 10/12] Fix typos

---
 transformer/vision_transformer.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/transformer/vision_transformer.py b/transformer/vision_transformer.py
index f066417..04195e2 100644
--- a/transformer/vision_transformer.py
+++ b/transformer/vision_transformer.py
@@ -1,3 +1,9 @@
+"""
+Data efficent image transformer (deit)
+from https://github.com/facebookresearch/deit, https://arxiv.org/abs/2012.12877
+"""
+
+
 from functools import partial
 import math
 
@@ -41,7 +47,7 @@ class Attention(nn.Module):
         qkv = self.qkv(input_data).reshape(
             batch_size, sequence_length, 3, self.head_count, channel_count // self.head_count).permute(
                 2, 0, 3, 1, 4)
-        # (output shape : 3, batch_size, head_ctoun, sequence_lenght, channel_count / head_count)
+        # (output shape : 3, batch_size, head_count, sequence_lenght, channel_count / head_count)
         query, key, value = qkv[0], qkv[1], qkv[2]
         attention = self.attention_drop(((query @ key.transpose(-2, -1)) * self.scale).softmax(dim=-1))
         return self.projection_drop(self.projector(
@@ -76,7 +82,7 @@ class VissionTransformer(nn.Module):
     NORM_LAYER = nn.LayerNorm
 
     def __init__(self, image_shape: tuple[int, int, int], class_count: int, depth: int,
-                 path_size: int = 16, embed_dim: int = 768,
+                 patch_size: int = 16, embed_dim: int = 768,
                  head_count: int = 8, mlp_ratio: float = 4.0, qkv_bias: bool = True, qk_scale: float = None,
                  representation_size=None, distilled: bool = False, drop_rate: float = 0.0,
                  attention_drop_rate: float = 0.0, drop_path_rate: float = 0.0, embed_layer=PatchEmbed,
@@ -92,7 +98,7 @@ class VissionTransformer(nn.Module):
         self.distilled = distilled
         norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
 
-        self.patch_embed = embed_layer(image_shape[1:], patch_size=path_size,
+        self.patch_embed = embed_layer(image_shape[1:], patch_size=patch_size,
                                        in_channels=image_shape[0], embed_dim=embed_dim)
         patch_count = self.patch_embed.patch_count
         token_count = 2 if distilled else 1
@@ -128,6 +134,7 @@ class VissionTransformer(nn.Module):
         if self.distilled:
             nn.init.trunc_normal_(self.distillation_token, std=0.02)
 
+        # Applying weights initialization made no difference so far
         self.apply(partial(self._init_weights, head_bias=-math.log(self.class_count)))
 
 
@@ -171,10 +178,11 @@ class VissionTransformer(nn.Module):
             elif name.startswith('pre_logits'):
                 nn.init.xavier_normal_(module.weight)
                 nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Conv2d):
-            nn.init.xavier_normal_(module.weight)
-            if module.bias is not None:
-                nn.init.zeros_(module.bias)
+        # pytorch init for conv is fine
+        # elif isinstance(module, nn.Conv2d):
+        #     nn.init.xavier_normal_(module.weight)
+        #     if module.bias is not None:
+        #         nn.init.zeros_(module.bias)
         elif isinstance(module, nn.LayerNorm):
             nn.init.ones_(module.weight)
             nn.init.zeros_(module.bias)

From 1bac46219b42fe41ba3568fdde3ca364b02e46e9 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Tue, 17 Aug 2021 15:54:35 +0900
Subject: [PATCH 11/12] Fix dropouts and typos in ViT

---
 transformer/vision_transformer.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/transformer/vision_transformer.py b/transformer/vision_transformer.py
index 04195e2..bdca511 100644
--- a/transformer/vision_transformer.py
+++ b/transformer/vision_transformer.py
@@ -1,6 +1,7 @@
 """
 Data efficent image transformer (deit)
 from https://github.com/facebookresearch/deit, https://arxiv.org/abs/2012.12877
+And ViT: https://arxiv.org/abs/2010.11929
 """
 
 
@@ -29,7 +30,7 @@ class PatchEmbed(nn.Module):
         return self.projector(input_data).flatten(2).transpose(1, 2)
 
 
-class Attention(nn.Module):
+class SelfAttention(nn.Module):
     def __init__(self, dim: int, head_count: int, qkv_bias: bool, qk_scale: float,
                  attention_drop_rate: float, projection_drop_rate: float):
         super().__init__()
@@ -38,9 +39,9 @@ class Attention(nn.Module):
         self.scale = qk_scale or head_dim ** -0.5
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attention_drop = nn.Dropout(attention_drop_rate) if attention_drop_rate > 0.0 else nn.Identity()
+        self.attention_drop = nn.Dropout(attention_drop_rate)
         self.projector = nn.Linear(dim, dim)
-        self.projection_drop = nn.Dropout(projection_drop_rate) if projection_drop_rate > 0.0 else nn.Identity()
+        self.projection_drop = nn.Dropout(projection_drop_rate)
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, channel_count = input_data.shape
@@ -62,7 +63,7 @@ class Block(nn.Module):
         super().__init__()
 
         self.norm1 = norm_layer(dim)
-        self.attention = Attention(dim, head_count, qkv_bias, qk_scale, attention_drop_rate, drop_rate)
+        self.attention = SelfAttention(dim, head_count, qkv_bias, qk_scale, attention_drop_rate, drop_rate)
         self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.norm2 = norm_layer(dim)
         self.mlp = nn.Sequential(
@@ -105,7 +106,7 @@ class VissionTransformer(nn.Module):
 
         self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
         self.distillation_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
-        self.position_embedings = nn.Parameter(torch.zeros(1, patch_count + token_count, embed_dim))
+        self.position_embeddings = nn.Parameter(torch.zeros(1, patch_count + token_count, embed_dim))
         self.position_drop = nn.Dropout(drop_rate) if drop_rate > 0.0 else nn.Identity()
 
         depth_path_drop_rates = np.linspace(0, drop_path_rate, depth) if drop_path_rate > 0.0 else [0.0] * depth
@@ -130,17 +131,16 @@ class VissionTransformer(nn.Module):
 
         # Init weights
         nn.init.trunc_normal_(self.class_token, std=0.02)
-        nn.init.trunc_normal_(self.position_embedings, std=0.02)
+        nn.init.trunc_normal_(self.position_embeddings, std=0.02)
         if self.distilled:
             nn.init.trunc_normal_(self.distillation_token, std=0.02)
 
         # Applying weights initialization made no difference so far
         self.apply(partial(self._init_weights, head_bias=-math.log(self.class_count)))
 
-
     @torch.jit.ignore
     def no_weight_decay(self) -> dict:
-        return {'class_token', 'distillation_token', 'position_embedings'}
+        return {'class_token', 'distillation_token', 'position_embeddings'}
 
     def get_classifier(self):
         return self.head if self.distillation_token is None else (self.head, self.head_distilled)
@@ -152,13 +152,13 @@ class VissionTransformer(nn.Module):
             self.embed_dim, self.class_count) if class_count > 0 and self.distilled else nn.Identity()
 
     def forward(self, input_data: torch.Tensor) -> torch.Tensor:
-        embedings = self.patch_embed(input_data)
-        class_token = self.class_token.expand(embedings.shape[0], -1, -1)
+        embeddings = self.patch_embed(input_data)
+        class_token = self.class_token.expand(embeddings.shape[0], -1, -1)
 
         if self.distilled:
             block_output = self.norm(self.blocks(self.position_drop(
-                torch.cat((class_token, self.distillation_token.expand(embedings.shape[0], -1, -1), embedings), dim=1)
-                + self.position_embedings)))
+                torch.cat((class_token, self.distillation_token.expand(embeddings.shape[0], -1, -1), embeddings), dim=1)
+                + self.position_embeddings)))
             distilled_head_output = self.head_distilled(block_output[:, 1])
             head_output = self.head(block_output[:, 0])
             if self.training and not torch.jit.is_scripting():
@@ -166,7 +166,7 @@ class VissionTransformer(nn.Module):
             return (head_output + distilled_head_output) / 2.0
 
         block_output = self.norm(self.blocks(self.position_drop(
-            torch.cat((class_token, embedings), dim=1) + self.position_embedings)))
+            torch.cat((class_token, embeddings), dim=1) + self.position_embeddings)))
         return self.head(self.pre_logits(block_output[:, 0]))
 
     @staticmethod

From 63592580613c0fc15f97b727b64b40d3d869db44 Mon Sep 17 00:00:00 2001
From: Corentin Risselin <corentin.pro@mail.com>
Date: Wed, 4 Jan 2023 16:58:48 +0900
Subject: [PATCH 12/12] Small fix, making h5py optional

---
 utils/batch_generator.py          |  4 ++--
 utils/sequence_batch_generator.py | 17 +++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/utils/batch_generator.py b/utils/batch_generator.py
index b8257ef..ad02b55 100644
--- a/utils/batch_generator.py
+++ b/utils/batch_generator.py
@@ -3,7 +3,6 @@ from multiprocessing import shared_memory
 import os
 from typing import Callable, Iterable, Optional, Tuple
 
-import h5py
 import numpy as np
 
 
@@ -20,6 +19,7 @@ class BatchGenerator:
         self.num_workers = num_workers
         self.flip_data = flip_data
         self.pipeline = pipeline
+        self.process_id = 'NA'
 
         if not preload:
             self.data_processor = data_processor
@@ -37,6 +37,7 @@ class BatchGenerator:
                     os.makedirs(os.path.dirname(save_path))
 
             if save and os.path.exists(save_path):
+                import h5py
                 with h5py.File(save_path, 'r') as h5_file:
                     self.data = np.asarray(h5_file['data'])
                     self.label = np.asarray(h5_file['label'])
@@ -80,7 +81,6 @@ class BatchGenerator:
         self.batch_data = first_data
         self.batch_label = first_label
 
-        self.process_id = 'NA'
         if self.prefetch or self.num_workers > 1:
             self.cache_memory_indices = shared_memory.SharedMemory(create=True, size=self.index_list.nbytes)
             self.cache_indices = np.ndarray(
diff --git a/utils/sequence_batch_generator.py b/utils/sequence_batch_generator.py
index 005caf3..947a0b2 100644
--- a/utils/sequence_batch_generator.py
+++ b/utils/sequence_batch_generator.py
@@ -3,7 +3,6 @@ from multiprocessing import shared_memory
 import os
 from typing import Callable, Iterable, Optional
 
-import h5py
 import numpy as np
 
 try:
@@ -25,12 +24,13 @@ class SequenceGenerator(BatchGenerator):
         self.prefetch = prefetch and not preload
         self.num_workers = num_workers
         self.pipeline = pipeline
+        self.process_id = 'NA'
 
         if not preload:
             self.data_processor = data_processor
             self.label_processor = label_processor
-            self.data = np.asarray(data)
-            self.label = np.asarray(label)
+            self.data = np.asarray(data, dtype=np.object)
+            self.label = np.asarray(label, dtype=np.object)
         else:
             self.data_processor = None
             self.label_processor = None
@@ -42,6 +42,7 @@ class SequenceGenerator(BatchGenerator):
                     os.makedirs(os.path.dirname(save_path))
 
             if save and os.path.exists(save_path):
+                import h5py
                 with h5py.File(save_path, 'r') as h5_file:
                     data_len = np.asarray(h5_file['data_len'])
                     self.data = []
@@ -49,22 +50,23 @@ class SequenceGenerator(BatchGenerator):
                     for sequence_index in range(data_len):
                         self.data.append(np.asarray(h5_file[f'data_{sequence_index}']))
                         self.label.append(np.asarray(h5_file[f'label_{sequence_index}']))
-                    self.data = np.asarray(self.data)
-                    self.label = np.asarray(self.label)
+                    self.data = np.asarray(self.data, dtype=np.object)
+                    self.label = np.asarray(self.label, dtype=np.object)
             else:
                 if data_processor:
                     self.data = np.asarray(
                         [np.asarray([data_processor(entry) for entry in serie]) for serie in data],
                         dtype=np.object if len(data) > 1 else None)
                 else:
-                    self.data = np.asarray(data)
+                    self.data = np.asarray(data, dtype=np.object)
                 if label_processor:
                     self.label = np.asarray(
                         [np.asarray([label_processor(entry) for entry in serie]) for serie in label],
                         dtype=np.object if len(label) > 1 else None)
                 else:
-                    self.label = np.asarray(label)
+                    self.label = np.asarray(label, dtype=np.object)
                 if save:
+                    import h5py
                     with h5py.File(save_path, 'w') as h5_file:
                         h5_file.create_dataset('data_len', data=len(self.data))
                         for sequence_index in range(len(self.data)):
@@ -133,7 +135,6 @@ class SequenceGenerator(BatchGenerator):
         self.batch_data = first_data
         self.batch_label = first_label
 
-        self.process_id = 'NA'
         if self.prefetch or self.num_workers > 1:
             self.cache_memory_indices = shared_memory.SharedMemory(create=True, size=self.index_list.nbytes)
             self.cache_indices = np.ndarray(