Jax implementation, code factorisation

* Compatibility with older Python versions (typing)
Corentin 2021-10-01 20:14:00 +09:00
commit 16b7239cd7
37 changed files with 1007 additions and 293 deletions


@@ -2,28 +2,46 @@ from argparse import ArgumentParser
import multiprocessing as mp
import os
from pathlib import Path
from typing import Type
import sys
from typing import List, Type
from config.benchmark import Config
from src.base import BenchBase
from src.common import DataType, Op, Platform
from src.plot import compare
def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, bench_op: Op,
bench_args, bench_count: int):
if platform == Platform.TF2:
if platform == Platform.JAX:
from src.jax.ops import jax_ops
if bench_op not in jax_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
jax_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
elif platform == Platform.TF2:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from src.tf_2.ops import tf2_ops
if bench_op not in tf2_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
tf2_ops[bench_op](output_path).run(bench_args, bench_count, data_type)
tf2_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
elif platform == Platform.TF2_V1:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from src.tf_2_v1.ops import tf2v1_ops
if bench_op not in tf2v1_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
tf2v1_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
elif platform == Platform.TORCH:
from src.pytorch.ops import torch_ops
if bench_op not in torch_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
torch_ops[bench_op](output_path).run(bench_args, bench_count, data_type)
torch_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
else:
print(f'Platform {platform.value} is not implemented yet')
@@ -32,6 +50,8 @@ def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, be
def main():
parser = ArgumentParser()
parser.add_argument('--output', type=Path, default=Path('output'), help='Path to output files')
parser.add_argument('--no-benchmark', action='store_true', default=False, help='Avoid running benchmarks')
parser.add_argument('--no-compare', action='store_true', default=False, help='Avoid running platform comparison')
parser.add_argument('--count', type=int, default=30,
help='Number of experiments per benchmark (for statistical analysis)')
parser.add_argument('--platform', nargs='*', type=Platform,
@@ -39,62 +59,56 @@ def main():
parser.add_argument('--data', nargs='*', type=DataType,
help='List of data types to benchmark [float16, float32, float64] (else all are used)')
parser.add_argument('--op', nargs='*', type=Op,
help='List of operation to benchmark [add, mul, div, matmul] (else all are used)')
help='List of operations to benchmark (add, mul, div, matmul, etc.) (else all are used)')
parser.add_argument('--list-op', action='store_true',
help='List all possible operations to benchmark (no further action will be done)')
parser.add_argument(
'--experiment-time', type=float,
help=f'Change time (in s) per experiment (default={Config.EXPERIMENT_TIME:0.3f}s)')
arguments = parser.parse_args()
if arguments.list_op:
print(', '.join([op.value for op in Op]))
sys.exit(0)
output_path: Path = arguments.output
no_benchmark: bool = arguments.no_benchmark
no_compare: bool = arguments.no_compare
bench_count: int = arguments.count
platforms: list[Platform] = arguments.platform if arguments.platform is not None else list(Platform)
data: list[DataType] = arguments.data if arguments.data is not None else list(DataType)
bench_ops: list[Op] = arguments.op if arguments.op is not None else list(Op)
platforms: List[Platform] = arguments.platform if arguments.platform is not None else list(Platform)
data: List[DataType] = arguments.data if arguments.data is not None else list(DataType)
bench_ops: List[Op] = arguments.op if arguments.op is not None else list(Op)
if arguments.experiment_time:
Config.EXPERIMENT_TIME = arguments.experiment_time
if not output_path.exists():
output_path.mkdir(parents=True)
benchmarks: list[dict[Op, Type[BenchBase]]] = []
element_wise_args = [
(100, 100),
(100, 200),
(128, 128),
(200, 100),
(200, 200),
(256, 256),
(256, 512),
(512, 256),
(400, 400),
(512, 512),
(800, 800),
(1024, 1024),
(1800, 1800)]
matmul_args = [
((100, 100), (100, 100)),
((100, 200), (200, 100)),
((128, 128), (128, 128)),
((200, 100), (100, 200)),
((200, 200), (200, 200)),
((256, 256), (256, 256)),
((256, 512), (512, 256)),
((400, 400), (400, 400)),
((512, 256), (256, 512)),
((512, 512), (512, 512)),
((800, 800), (800, 800)),
((1000, 1000), (1000, 1000)),
((1200, 1200), (1200, 1200))]
if not no_benchmark:
benchmarks: List[dict[Op, Type[BenchBase]]] = []
for platform in platforms:
for data_type in data:
for bench_op in [Op.ADD, Op.MUL, Op.DIV]:
if bench_op in bench_ops:
benchmarks.append((output_path, platform, data_type, bench_op,
Config.ELEMENT_WISE_ARGS, bench_count))
for bench_op in [Op.MATMUL, Op.NN_MATMUL]:
if bench_op in bench_ops:
benchmarks.append((output_path, platform, data_type, bench_op, Config.MATMUL_ARGS, bench_count))
if Op.NN_DENSE in bench_ops:
benchmarks.append((output_path, platform, data_type, Op.NN_DENSE, Config.NN_1D_ARGS, bench_count))
for platform in platforms:
for data_type in data:
for bench_op in [Op.ADD, Op.MUL, Op.DIV]:
if bench_op in bench_ops:
benchmarks.append((output_path, platform, data_type, bench_op, element_wise_args, bench_count))
if Op.MATMUL in bench_ops:
benchmarks.append((output_path, platform, data_type, Op.MATMUL, matmul_args, bench_count))
if benchmarks:
for benchmark in benchmarks:
process = mp.Process(target=run_benchmark, args=benchmark)
process.start()
process.join()
print('Benchmark done')
for benchmark in benchmarks:
process = mp.Process(target=run_benchmark, args=benchmark)
process.start()
process.join()
print('Benchmark done')
if not no_compare:
compare(output_path)
print('Compare done')
if __name__ == '__main__':
    main()
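For reference, each tuple queued in benchmarks mirrors run_benchmark's signature, so a single entry is equivalent to a direct call like the sketch below (values illustrative); running each one in its own mp.Process presumably keeps one framework's device state from leaking into the next benchmark.

from pathlib import Path
from config.benchmark import Config
from src.common import DataType, Op, Platform

# one queued benchmark, invoked directly (a sketch, not project code)
run_benchmark(Path('output'), Platform.JAX, DataType.FLOAT32, Op.MATMUL,
              Config.MATMUL_ARGS, bench_count=30)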

config/benchmark.py Normal file

@@ -0,0 +1,41 @@
class Config:
EXPERIMENT_TIME = 1.0
ELEMENT_WISE_ARGS = [
(100, 100),
(100, 200),
(128, 128),
(200, 100),
(200, 200),
(256, 256),
(256, 512),
(512, 256),
(400, 400),
(512, 512),
(800, 800),
(1024, 1024),
(1800, 1800)]
MATMUL_ARGS = [
((100, 100), (100, 100)),
((100, 200), (200, 100)),
((128, 128), (128, 128)),
((200, 100), (100, 200)),
((200, 200), (200, 200)),
((256, 256), (256, 256)),
((256, 512), (512, 256)),
((400, 400), (400, 400)),
((512, 256), (256, 512)),
((512, 512), (512, 512)),
((800, 800), (800, 800)),
((1000, 1000), (1000, 1000)),
((1200, 1200), (1200, 1200))]
NN_1D_ARGS = [
(1, 16), (16, 16), (64, 16),
(1, 64), (16, 64),
(1, 150), (16, 150),
(1, 256), (16, 256),
(1, 400), (16, 400), (64, 400),
(1, 512), (16, 512), (64, 512),
(1, 800), (16, 800), (64, 800),
(1, 1024), (16, 1024),
(1, 2000), (16, 2000), (64, 2000),
(1, 4000), (16, 4000), (64, 4000)]

requirements.txt Normal file

@@ -0,0 +1,4 @@
matplotlib
seaborn
tensorflow
torch
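A hedged note on this file: the JAX path added by this commit also needs jax and jaxlib at runtime, and src/base.py imports numpy and pandas; none of these are pinned here, so presumably they are expected to be installed separately.

# assumed additional runtime dependencies, not listed in this file:
# jax, jaxlib, numpy, pandas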

src/base.py

@@ -1,69 +1,71 @@
from pathlib import Path
import time
from typing import List
import numpy as np
import pandas as pd
from config.benchmark import Config
from src.common import DataKey, DataType, Device, Op, Platform
from src.plot import plot_experiments
from src.op_info import op_infos
from src.utils import get_cpu_name, get_nvidia_name
class BenchBase():
def __init__(self, output_path: Path, platform: Platform, bench_op: Op, device_type: Device, device):
def __init__(self, output_path: Path, platform: Platform, bench_op: Op,
device_type: Device, device,
data_type: DataType, dtype):
self._base_output_path = output_path
self.output_path = output_path
self.platform = platform
self.bench_op = bench_op
self.device_type = device_type
self.device = device
self.dtype = None
self.device_name = get_cpu_name() if self.device_type == Device.CPU else get_nvidia_name()
self.data_type = data_type
self.dtype = dtype
self.info = op_infos[bench_op]
def set_output_path(self, device: Device, device_name: str):
self.output_path = (
self._base_output_path / f'{device.value}_{device_name}' / self.platform.value / self.bench_op.value)
self._base_output_path / f'{self.device_type.value}_{self.device_name}'
/ self.platform.value / self.bench_op.value) # noqa
def get_dtype(self, data_type: DataType):
def pre_experiment(self, _experiment_args):
pass
def experiment(self):
raise NotImplementedError()
def experiment(self, _experiment_args, _length, _dtype, _device):
raise NotImplementedError()
def name(self, _experiment_args) -> str:
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
raise NotImplementedError()
def run(self, experiment_args, experiment_count: int, data_type: DataType):
self.set_output_path(self.device_type, get_cpu_name() if self.device_type == Device.CPU else get_nvidia_name())
def post_experiment(self):
pass
def run(self, experiment_args, experiment_count: int):
if not self.output_path.exists():
self.output_path.mkdir(parents=True)
dtype = self.get_dtype(data_type)
print(f'Starting {self.platform.value}\'s {self.bench_op.value} benchmark with data type: {data_type.value}')
print(f'Starting {self.platform.value}\'s {self.bench_op.value} benchmark'
f' with data type: {self.data_type.value}')
experiment_names = []
experiment_lengths = []
experiment_times = []
experiment_mop = []
for args in experiment_args:
self.pre_experiment(args)
# warmup
for _ in range(4):
self.experiment(args, 5, dtype, self.device)
for _ in range(20):
self.experiment()
# speed evaluation
counter = 0
start_time = time.time()
while time.time() - start_time < 0.2:
self.experiment(args, 10, dtype, self.device)
counter += 10
while (time.time() - start_time) < (Config.EXPERIMENT_TIME / 5):
self.experiment()
counter += 1
end_time = time.time()
target_time = 1.0 # in s
target_time = Config.EXPERIMENT_TIME # in s
experiment_speed = counter / (end_time - start_time) # in op/s
experiment_length = max(int(target_time / experiment_count * experiment_speed), 2)
# print(f'Evaluated {counter} {self.bench_op.value} in {end_time - start_time:0.3f}s'
@@ -73,24 +75,28 @@ class BenchBase():
run_times = []
for _ in range(experiment_count):
start_time = time.time()
self.experiment(args, experiment_length, dtype, self.device)
for _ in range(experiment_length):
self.experiment()
run_times.append(time.time() - start_time)
experiment_times += run_times
experiment_names += [self.name(args)] * experiment_count
experiment_names += [self.info.name(args)] * experiment_count
experiment_lengths += [experiment_length] * experiment_count
experiment_mop += [self.mop(args)] * experiment_count
experiment_mop += [self.info.mop(args)] * experiment_count
total_time = np.array(run_times, dtype=np.float64).sum()
total_glop = self.mop(args) * experiment_length * experiment_count / 1000
total_glop = self.info.mop(args) * experiment_length * experiment_count / 1000
print(f'Run {experiment_names[-1]} (x{experiment_length})'
f' in {total_time:0.2f}s => {total_glop / total_time:0.3f}GFLOPS')
self.post_experiment()
data = self.save_experiments(experiment_names, experiment_times, experiment_lengths, experiment_mop, data_type)
plot_experiments(self.output_path, data, data_type, self.bench_op, self.platform)
data = self.save_experiments(experiment_names, experiment_times, experiment_lengths, experiment_mop)
# Avoid circular import
from src.plot import plot_experiments # pylint: disable=import-outside-toplevel
plot_experiments(self, data)
def save_experiments(
self, experiment_names: list[str], experiment_times: list[float],
experiment_lengths: list[int], experiment_mop: list[float], data_type: DataType) -> pd.DataFrame:
self, experiment_names: List[str], experiment_times: List[float],
experiment_lengths: List[int], experiment_mop: List[float]) -> pd.DataFrame:
key = DataKey(self.bench_op)
data = pd.DataFrame(
{
@@ -102,5 +108,5 @@ class BenchBase():
key.gflops: [(mop * l) / (t * 1000.0)
for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)]
})
data.to_csv(self.output_path / f'{self.bench_op.value}_{data_type.value}.csv', sep='\t')
data.to_csv(self.output_path / f'{self.bench_op.value}_{self.data_type.value}.csv', sep='\t')
return data
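To make the calibration step in run() concrete, here is the arithmetic with illustrative numbers, assuming the default Config.EXPERIMENT_TIME of 1.0 s and an experiment_count of 30:

# probe: call experiment() in a loop for EXPERIMENT_TIME / 5 = 0.2 s
experiment_speed = 5000.0  # e.g. 1000 calls completed in 0.2 s -> 5000 op/s
experiment_length = max(int(1.0 / 30 * experiment_speed), 2)  # -> 166
# each of the 30 timed runs then loops experiment() 166 times,
# so the whole measurement takes roughly EXPERIMENT_TIME in total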

src/common.py

@@ -13,16 +13,19 @@ class DataType(Enum):
class Op(Enum):
NO_OP = 'noop'
ADD = 'add'
DIV = 'div'
MUL = 'mul'
MATMUL = 'matmul'
NN_MATMUL = 'nn_matmul'
NN_DENSE = 'nn_dense'
class Platform(Enum):
TF1 = 'TF1'
JAX = 'jax'
# TF1 = 'TF1'
TF2 = 'TF2'
TF2_V1 = 'TF2_V1'
TORCH = 'Torch'

src/jax/add.py Normal file

@@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxAddBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD, data_type)
self.tensor_1: jnp.DeviceArray = None
self.tensor_2: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_2 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_result = jnp.add(self.tensor_1, self.tensor_2).block_until_ready()
def experiment(self):
self.tensor_result = jnp.add(self.tensor_1, self.tensor_2).block_until_ready()
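The block_until_ready() calls are what make the timings meaningful: JAX dispatches work asynchronously, so without them a timed loop would mostly measure enqueueing rather than the add itself. In miniature:

import jax.numpy as jnp

tensor_1 = jnp.ones((512, 512))
tensor_2 = jnp.ones((512, 512))
result = jnp.add(tensor_1, tensor_2)  # returns immediately; the device may still be computing
result.block_until_ready()            # blocks until the result is actually materialised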

src/jax/base.py Normal file

@@ -0,0 +1,34 @@
from pathlib import Path
import jax.numpy as jnp
import jax
from src.base import BenchBase
from src.common import DataType, Device, Op, Platform
class JaxBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
gpu_devices = jax.devices('gpu')
if gpu_devices:
if len(gpu_devices) > 1:
print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
device_type = Device.GPU
device = gpu_devices[0]
else:
device_type = Device.CPU
device = jax.devices('cpu')[0]
if data_type == DataType.FLOAT16:
dtype = jnp.float16
elif data_type == DataType.FLOAT32:
dtype = jnp.float32
elif data_type == DataType.FLOAT64:
dtype = jnp.float64
else:
raise NotImplementedError(f'data_type {data_type.value} not implemented')
super().__init__(output_path, Platform.JAX, bench_op, device_type, device, data_type, dtype)
def experiment(self):
raise NotImplementedError()
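One caveat this base class does not handle (a standard JAX default, not something visible in the diff): requests for float64 fall back to float32 unless 64-bit mode is enabled before arrays are created, e.g.:

import jax

# without this, jnp.ones(..., dtype=jnp.float64) comes back as float32
jax.config.update('jax_enable_x64', True)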

src/jax/div.py Normal file

@@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxDivBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV, data_type)
self.tensor_1: jnp.DeviceArray = None
self.tensor_2: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_2 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_result = jnp.divide(self.tensor_1, self.tensor_2).block_until_ready()
def experiment(self):
self.tensor_result = jnp.divide(self.tensor_1, self.tensor_2).block_until_ready()

src/jax/matmul.py Normal file

@@ -0,0 +1,28 @@
from pathlib import Path
from typing import List, Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxMatmulBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL, data_type)
self.tensor_1: jnp.DeviceArray = None
self.tensor_2: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_2 = device_put(jnp.ones(shape_2, dtype=self.dtype))
self.tensor_result = jnp.matmul(self.tensor_1, self.tensor_2).block_until_ready()
def experiment(self):
self.tensor_result = jnp.matmul(self.tensor_1, self.tensor_2).block_until_ready()
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/jax/mul.py Normal file

@@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxMulBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL, data_type)
self.tensor_1: jnp.DeviceArray = None
self.tensor_2: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_2 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_result = jnp.multiply(self.tensor_1, self.tensor_2).block_until_ready()
def experiment(self):
self.tensor_result = jnp.multiply(self.tensor_1, self.tensor_2).block_until_ready()

src/jax/nn_dense.py Normal file

@@ -0,0 +1,32 @@
from pathlib import Path
from typing import Callable, List, Tuple
from jax import device_put, jit, random
from jax.experimental import stax
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxNNDenseBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_DENSE, data_type)
self.tensor: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
self.network: Callable = None
self.params = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
batch_size, dimension = experiment_args
self.tensor = device_put(jnp.ones((batch_size, dimension), dtype=self.dtype))
network_init, self.network = stax.Dense(dimension)
_, self.params = network_init(random.PRNGKey(1), (batch_size, dimension))
self.network = jit(self.network)
self.tensor_result = self.network(self.params, self.tensor)
def experiment(self):
self.tensor_result = self.network(self.params, self.tensor)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/jax/nn_matmul.py Normal file

@@ -0,0 +1,33 @@
from pathlib import Path
from typing import List, Tuple
from jax import device_put, jit
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
def matmul(tensor_1: jnp.DeviceArray, tensor_2: jnp.DeviceArray) -> jnp.DeviceArray:
return tensor_1 @ tensor_2
class JaxNNMatmulBench(JaxBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_MATMUL, data_type)
self.tensor_1: jnp.DeviceArray = None
self.tensor_2: jnp.DeviceArray = None
self.tensor_result: jnp.DeviceArray = None
self.network = jit(matmul)
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
self.tensor_2 = device_put(jnp.ones(shape_2, dtype=self.dtype))
self.tensor_result = self.network(self.tensor_1, self.tensor_2)
def experiment(self):
self.tensor_result = self.network(self.tensor_1, self.tensor_2)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)
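jit(matmul) is traced and compiled the first time it is called with a new shape/dtype signature, which is why pre_experiment runs the network once before timing begins. A sketch of that behaviour:

from jax import jit
import jax.numpy as jnp

def matmul(tensor_1, tensor_2):
    return tensor_1 @ tensor_2

fast_matmul = jit(matmul)
tensor_1 = jnp.ones((512, 256))
tensor_2 = jnp.ones((256, 512))
_ = fast_matmul(tensor_1, tensor_2)  # first call: trace + XLA compilation for these shapes
_ = fast_matmul(tensor_1, tensor_2)  # later calls reuse the cached executable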

src/jax/ops.py Normal file

@@ -0,0 +1,20 @@
from typing import Type
from src.common import Op
from src.jax.add import JaxAddBench
from src.jax.base import JaxBase
from src.jax.div import JaxDivBench
from src.jax.mul import JaxMulBench
from src.jax.matmul import JaxMatmulBench
from src.jax.nn_dense import JaxNNDenseBench
from src.jax.nn_matmul import JaxNNMatmulBench
jax_ops: dict[Op, Type[JaxBase]] = {
Op.ADD: JaxAddBench,
Op.MUL: JaxMulBench,
Op.DIV: JaxDivBench,
Op.MATMUL: JaxMatmulBench,
Op.NN_MATMUL: JaxNNMatmulBench,
Op.NN_DENSE: JaxNNDenseBench
}
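run_benchmark dispatches through this registry; a minimal sketch of the lookup it performs:

from pathlib import Path
from config.benchmark import Config
from src.common import DataType, Op

bench_cls = jax_ops[Op.MATMUL]  # -> JaxMatmulBench
bench_cls(Path('output'), DataType.FLOAT32).run(Config.MATMUL_ARGS, 30)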

src/op_info.py Normal file

@@ -0,0 +1,85 @@
from typing import Dict, List, Type, Tuple
from src.common import Op
class _BaseInfo():
@staticmethod
def name(experiment_args) -> str:
raise NotImplementedError()
@staticmethod
def mop(experiment_args) -> float:
raise NotImplementedError()
class AddInfo(_BaseInfo):
@staticmethod
def name(experiment_args: Tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'
@staticmethod
def mop(experiment_args: Tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1_000_000
class DivInfo(_BaseInfo):
@staticmethod
def name(experiment_args: Tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'
@staticmethod
def mop(experiment_args: Tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1_000_000
class MulInfo(_BaseInfo):
@staticmethod
def name(experiment_args: Tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'
@staticmethod
def mop(experiment_args: Tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1_000_000
class MatmulInfo(_BaseInfo):
@staticmethod
def name(experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]) -> str:
shape_1, shape_2 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'
@staticmethod
def mop(experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]) -> float:
shape_1, shape_2 = experiment_args
return (shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)
class DenseInfo(_BaseInfo):
@staticmethod
def name(experiment_args: Tuple[int, int]) -> str:
batch_size, dimension = experiment_args
return f'Dense(({batch_size}x{dimension}))'
@staticmethod
def mop(experiment_args: Tuple[int, int]) -> float:
batch_size, dimension = experiment_args
return batch_size * (
((dimension * dimension / 1_000_000) * 2 * (dimension - 1)) + (
dimension / 1_000_000))
op_infos: Dict[Op, Type[_BaseInfo]] = {
Op.ADD: AddInfo,
Op.DIV: DivInfo,
Op.MUL: MulInfo,
Op.MATMUL: MatmulInfo,
Op.NN_MATMUL: MatmulInfo,
Op.NN_DENSE: DenseInfo
}
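As a worked example, the matmul cost model charges 2 * (k - 1) operations per output element of an m x k @ k x n product, expressed in millions of operations (Mop):

# MatmulInfo.mop(((512, 256), (256, 512)))
# = (512 * 512 / 1_000_000) * 2 * (256 - 1)
# = 0.262144 * 510
# ~= 133.7 Mop per matmul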

src/plot.py

@@ -1,4 +1,7 @@
from pathlib import Path
import math
import multiprocessing as mp
import os
import numpy as np
import matplotlib.pyplot as plt
@@ -6,11 +9,20 @@ import pandas as pd
import seaborn as sns
from src.base import BenchBase
from src.common import DataKey, DataType, Op, Platform
def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType, bench_op: Op, platform: Platform):
key = DataKey(bench_op)
class CompKey:
def __init__(self):
self.data_type = 'data_type'
self.device = 'device'
self.bench_op = 'op'
self.platform = 'platform'
def plot_experiments(bench: BenchBase, data: pd.DataFrame):
key = DataKey(bench.bench_op)
sum_data = data[[key.experiment, key.time, key.count]].groupby(
key.experiment, as_index=False, sort=False).sum()
mean_data = data[[key.experiment, key.speed]].groupby(
@@ -20,12 +32,12 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
sns.set_theme(style="ticks")
figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12))
figure.suptitle(f'{platform.value} {bench_op.value} ({data_type.value})', fontsize=16)
for axe in axes[:-1]:
axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
chart = sns.barplot(x=key.experiment, y=key.mop, data=max_data, ax=axes[0], order=data[key.experiment].unique())
axes[0].set_yscale("log")
if max_data[key.mop].max() > max_data[key.mop].min() * 100:
axes[0].set_yscale("log")
for patch, value in zip(chart.patches, max_data[key.mop]):
chart.annotate(f'{value:0.3f}',
(patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
@@ -33,6 +45,8 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points')
chart = sns.barplot(x=key.experiment, y=key.speed, data=data, estimator=np.median, ax=axes[1])
if data[key.speed].max() > data[key.speed].min() * 100:
axes[1].set_yscale("log")
for patch, value in zip(chart.patches, mean_data[key.speed]):
chart.annotate(f'{value:.3f}',
(patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
@@ -40,6 +54,8 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points')
chart = sns.barplot(x=key.experiment, y=key.gflops, data=data, estimator=np.median, ax=axes[2])
if data[key.gflops].max() > data[key.gflops].min() * 100:
axes[2].set_yscale("log")
for patch, mop, count, value in zip(chart.patches, max_data[key.mop], sum_data[key.count], sum_data[key.time]):
chart.annotate(f'{(mop * count / 1000) / value:.3f}',
(patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
@@ -47,5 +63,62 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points')
plt.xticks(rotation=20)
plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.93, right=0.99, bottom=0.1, left=0.05)
plt.savefig(output_path / f'{bench_op.value}_{data_type.value}.png')
plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.91, right=0.99, bottom=0.1, left=0.05)
figure.suptitle(f'{bench.platform.value} {bench.bench_op.value} ({bench.data_type.value})', fontsize=16)
axes[0].set_title(f'{bench.device_name}', fontsize=12)
plt.savefig(bench.output_path / f'{bench.bench_op.value}_{bench.data_type.value}.png')
def _draw_comparison(all_data: pd.DataFrame, comp_key: CompKey, device: str, bench_op: str, output_path: Path):
op_data = all_data[(all_data[comp_key.bench_op] == bench_op) & (all_data[comp_key.device] == device)]
platform_list = op_data[comp_key.platform].unique()
if len(platform_list) <= 1:
return
key = DataKey(Op(bench_op))
sns.set_theme(style="ticks")
for data_type in op_data[comp_key.data_type].unique():
data = op_data[op_data[comp_key.data_type] == data_type]
graph = sns.catplot(x=key.experiment, y=key.gflops, hue=comp_key.platform, data=data,
kind='bar', estimator=np.median, height=8, aspect=1.4)
if data[key.gflops].max() > data[key.gflops].min() * 100:
graph.set(yscale="log")
plt.xticks(rotation=70, fontsize=8)
plt.subplots_adjust(top=0.92, bottom=0.25)
plt.suptitle('/'.join(platform_list) + f' {bench_op} ({data_type})', fontsize=16)
plt.title(f'{device}', fontsize=12)
plt.savefig(output_path / device / f'{bench_op}_{data_type}.png')
def compare(output_path: Path):
all_data: pd.DataFrame = None
comp_key = CompKey()
for data_path in output_path.rglob('*.csv'):
if len(data_path.parents) <= 4:
print(f'Warning: cannot parse data at path {data_path} (subfolders missing)')
data_type = DataType(data_path.stem.split('_')[-1])
bench_op = Op(data_path.parents[0].name)
platform = Platform(data_path.parents[1].name)
device_name = data_path.parents[2].name
current_data = pd.read_csv(data_path, sep='\t')
current_data[comp_key.data_type] = data_type.value
current_data[comp_key.bench_op] = bench_op.value
current_data[comp_key.platform] = platform.value
current_data[comp_key.device] = device_name
if all_data is None:
all_data = current_data
else:
all_data = all_data.append(current_data, ignore_index=True, verify_integrity=True)
# Compare between platforms
comp_args = []
for device in all_data[comp_key.device].unique():
for bench_op in all_data[comp_key.bench_op].unique():
comp_args.append((all_data, comp_key, device, bench_op, output_path))
with mp.Pool(processes=math.ceil(os.cpu_count() * 0.8)) as pool:
pool.starmap(_draw_comparison, comp_args)
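compare() recovers all its metadata from the CSV paths, which set_output_path in src/base.py lays out as <output>/<device>_<device_name>/<platform>/<op>/<op>_<dtype>.csv. With a hypothetical file (the device name is illustrative):

from pathlib import Path

data_path = Path('output/gpu_GeForce RTX 3090/jax/matmul/matmul_float32.csv')
data_path.stem.split('_')[-1]  # -> 'float32' (data type)
data_path.parents[0].name      # -> 'matmul' (op)
data_path.parents[1].name      # -> 'jax' (platform)
data_path.parents[2].name      # -> 'gpu_GeForce RTX 3090' (device folder)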

src/pytorch/add.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import torch
@@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchAddBench(TorchBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.ADD)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 + self.tensor_2
for _ in range(length):
_ = tensor_1 + tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 + self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/base.py

@@ -7,7 +7,7 @@ from src.common import DataType, Device, Op, Platform
class TorchBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op):
def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
if torch.cuda.is_available():
if torch.cuda.device_count() > 1:
print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
@@ -18,22 +18,16 @@ class TorchBase(BenchBase):
device_type = Device.CPU
device = torch.device('cpu')
super().__init__(output_path, Platform.TORCH, bench_op, device_type, device)
def get_dtype(self, data_type: DataType) -> torch.dtype:
if data_type == DataType.FLOAT16:
return torch.float16
if data_type == DataType.FLOAT32:
return torch.float32
if data_type == DataType.FLOAT64:
return torch.float64
raise NotImplementedError(f'data_type {data_type.value} not implemented')
dtype = torch.float16
elif data_type == DataType.FLOAT32:
dtype = torch.float32
elif data_type == DataType.FLOAT64:
dtype = torch.float64
else:
raise NotImplementedError(f'data_type {data_type.value} not implemented')
def experiment(self, _experiment_args, _length, _dtype, _device):
raise NotImplementedError()
super().__init__(output_path, Platform.TORCH, bench_op, device_type, device, data_type, dtype)
def name(self, _experiment_args) -> str:
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
def experiment(self):
raise NotImplementedError()
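A timing caveat worth keeping in mind when reading the GPU numbers (general PyTorch behaviour, not addressed in this diff): CUDA kernels launch asynchronously, so wall-clock measurements are normally bracketed with an explicit synchronisation:

import time
import torch

tensor_1 = torch.ones((512, 512), device='cuda')
tensor_2 = torch.ones((512, 512), device='cuda')
start = time.time()
result = tensor_1 @ tensor_2  # launches the kernel and returns immediately
torch.cuda.synchronize()      # wait for the kernel to actually finish
elapsed = time.time() - start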

src/pytorch/div.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import torch
@@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchDivBench(TorchBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.DIV)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 / self.tensor_2
for _ in range(length):
_ = tensor_1 / tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 / self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/matmul.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import torch
@@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchMatmulBench(TorchBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.MATMUL)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
tensor_2 = torch.ones(shape_2, dtype=dtype, device=device, requires_grad=False)
self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_2 = torch.ones(shape_2, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 @ self.tensor_2
for _ in range(length):
_ = tensor_1 @ tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 @ self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1, shape_2 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1, shape_2 = experiment_args
return (shape_1[0] * shape_2[1] / 1000_000) * 2 * (shape_1[1] - 1)
def run(self,
experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/mul.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import torch
@@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchMulBench(TorchBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.MUL)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 * self.tensor_2
for _ in range(length):
_ = tensor_1 * tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 * self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/nn_dense.py Normal file

@@ -0,0 +1,36 @@
from pathlib import Path
from typing import List, Tuple
import torch
from src.common import DataType, Op
from src.pytorch.base import TorchBase
class DenseNetwork(torch.nn.Module):
def __init__(self, input_dim: int, dtype: torch.dtype):
super().__init__()
self.dense = torch.nn.Linear(input_dim, input_dim, dtype=dtype)
def forward(self, input_data: torch.Tensor) -> torch.Tensor:
return self.dense(input_data)
class TorchNNDenseBench(TorchBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_DENSE, data_type)
self.tensor: torch.Tensor = None
self.tensor_result: torch.Tensor = None
self.network: torch.nn.Module = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
batch_size, dimension = experiment_args
self.tensor = torch.ones((batch_size, dimension), dtype=self.dtype, device=self.device, requires_grad=False)
self.network = DenseNetwork(dimension, self.dtype).to(self.device)
self.tensor_result = self.network(self.tensor)
def experiment(self):
self.tensor_result = self.network(self.tensor)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/nn_matmul.py Normal file

@@ -0,0 +1,34 @@
from pathlib import Path
from typing import List, Tuple
import torch
from src.common import DataType, Op
from src.pytorch.base import TorchBase
class MatMulNetwork(torch.nn.Module):
def forward(self, input_1: torch.Tensor, input_2: torch.Tensor) -> torch.Tensor:
return input_1 @ input_2
class TorchNNMatmulBench(TorchBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_MATMUL, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
self.network: torch.nn.Module = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_2 = torch.ones(shape_2, dtype=self.dtype, device=self.device, requires_grad=False)
self.network = MatMulNetwork()
self.tensor_result = self.network(self.tensor_1, self.tensor_2)
def experiment(self):
self.tensor_result = self.network(self.tensor_1, self.tensor_2)
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/pytorch/ops.py

@@ -6,11 +6,15 @@ from src.pytorch.base import TorchBase
from src.pytorch.div import TorchDivBench
from src.pytorch.mul import TorchMulBench
from src.pytorch.matmul import TorchMatmulBench
from src.pytorch.nn_dense import TorchNNDenseBench
from src.pytorch.nn_matmul import TorchNNMatmulBench
torch_ops: dict[Op, Type[TorchBase]] = {
Op.ADD: TorchAddBench,
Op.MUL: TorchMulBench,
Op.DIV: TorchDivBench,
Op.MATMUL: TorchMatmulBench
Op.MATMUL: TorchMatmulBench,
Op.NN_MATMUL: TorchNNMatmulBench,
Op.NN_DENSE: TorchNNDenseBench
}

src/tf_2/add.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
@@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFAddBench(TFBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.ADD)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
with device:
tensor_1 = tf.ones(shape_1, dtype=dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype)
with self.device:
self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 + self.tensor_2
for _ in range(length):
_ = tensor_1 + tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 + self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2/base.py

@@ -7,13 +7,13 @@ from src.common import DataType, Device, Op, Platform
class TFBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op):
def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
gpus = tf.config.list_physical_devices('GPU')
if gpus:
if len(gpus) > 1:
print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
tf.config.experimental.set_memory_growth(gpus[0], True)
# tf.config.experimental.set_memory_growth(gpus[0], True)
tf.config.set_visible_devices(gpus[0], 'GPU')
# logical_gpus = tf.config.list_logical_devices('GPU')
device_type = Device.GPU
@@ -22,22 +22,16 @@ class TFBase(BenchBase):
device_type = Device.CPU
device = tf.device('/CPU:0')
super().__init__(output_path, Platform.TF2, bench_op, device_type, device)
def get_dtype(self, data_type: DataType) -> tf.DType:
if data_type == DataType.FLOAT16:
return tf.float16
if data_type == DataType.FLOAT32:
return tf.float32
if data_type == DataType.FLOAT64:
return tf.float64
raise RuntimeError(f'data_type {data_type.value} not implemented')
dtype = tf.float16
elif data_type == DataType.FLOAT32:
dtype = tf.float32
elif data_type == DataType.FLOAT64:
dtype = tf.float64
else:
raise RuntimeError(f'data_type {data_type.value} not implemented')
def experiment(self, _experiment_args, _length, _dtype, _device):
raise NotImplementedError()
super().__init__(output_path, Platform.TF2, bench_op, device_type, device, data_type, dtype)
def name(self, _experiment_args) -> str:
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
def experiment(self):
raise NotImplementedError()

src/tf_2/div.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
@@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFDivBench(TFBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.DIV)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
with device:
tensor_1 = tf.ones(shape_1, dtype=dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype)
with self.device:
self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 / self.tensor_2
for _ in range(length):
_ = tensor_1 / tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 / self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2/matmul.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
@@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFMatmulBench(TFBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.MATMUL)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
with device:
tensor_1 = tf.ones(shape_1, dtype=dtype)
tensor_2 = tf.ones(shape_2, dtype=dtype)
with self.device:
self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_2 = tf.ones(shape_2, dtype=self.dtype)
self.tensor_result = self.tensor_1 @ self.tensor_2
for _ in range(length):
_ = tensor_1 @ tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 @ self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1, shape_2 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1, shape_2 = experiment_args
return (shape_1[0] * shape_2[1] / 1000_000) * 2 * (shape_1[1] - 1)
def run(self,
experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2/mul.py

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
@@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFMulBench(TFBase):
def __init__(self, output_path: Path):
super().__init__(output_path, Op.MUL)
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args
with device:
tensor_1 = tf.ones(shape_1, dtype=dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype)
with self.device:
self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 * self.tensor_2
for _ in range(length):
_ = tensor_1 * tensor_2
def experiment(self):
self.tensor_result = self.tensor_1 * self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str:
shape_1 = experiment_args
return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2/nn_dense.py Normal file

@@ -0,0 +1,35 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
from src.common import DataType, Op
from src.tf_2.base import TFBase
class DenseModel(tf.keras.Model):
def __init__(self, input_dim: int, dtype: tf.DType):
super().__init__()
self.dense = tf.keras.layers.Dense(input_dim, dtype=dtype)
def call(self, input_tensor: tf.Tensor) -> tf.Tensor:
return self.dense(input_tensor)
class TFNNDenseBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_DENSE, data_type)
self.tensor: tf.Tensor = None
self.network: tf.keras.Model = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
batch_size, dimension = experiment_args
with self.device:
self.tensor = tf.ones((batch_size, dimension), dtype=self.dtype)
self.network = DenseModel(dimension, self.dtype)
def experiment(self):
self.network(self.tensor)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2/nn_matmul.py Normal file

@@ -0,0 +1,34 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
from src.common import DataType, Op
from src.tf_2.base import TFBase
class MatmulModel(tf.keras.Model):
def call(self, tensor_1: tf.Tensor, tensor_2: tf.Tensor) -> tf.Tensor:
return tf.matmul(tensor_1, tensor_2)
class TFNNMatmulBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.NN_MATMUL, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
self.network: tf.keras.Model = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args
with self.device:
self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_2 = tf.ones(shape_2, dtype=self.dtype)
self.network = MatmulModel()
def experiment(self):
self.tensor_result = self.network(self.tensor_1, self.tensor_2)
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)
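Note that a Keras model invoked directly like this runs eagerly; compiling the call into a graph is usually done with tf.function (a hedged aside, not something this diff does):

network = MatmulModel()
compiled = tf.function(network)
result = compiled(tensor_1, tensor_2)  # first call traces; later calls run the compiled graph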

src/tf_2/ops.py

@@ -6,11 +6,15 @@ from src.tf_2.base import TFBase
from src.tf_2.div import TFDivBench
from src.tf_2.mul import TFMulBench
from src.tf_2.matmul import TFMatmulBench
from src.tf_2.nn_dense import TFNNDenseBench
from src.tf_2.nn_matmul import TFNNMatmulBench
tf2_ops: dict[Op, Type[TFBase]] = {
Op.ADD: TFAddBench,
Op.MUL: TFMulBench,
Op.DIV: TFDivBench,
Op.MATMUL: TFMatmulBench
Op.MATMUL: TFMatmulBench,
Op.NN_MATMUL: TFNNMatmulBench,
Op.NN_DENSE: TFNNDenseBench
}

src/tf_2_v1/add.py Normal file

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFAddBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD, data_type)
self.add_op = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
super().pre_experiment(experiment_args)
shape_1 = experiment_args
tensor_1 = tf.get_variable('tensor_1', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
tensor_2 = tf.get_variable('tensor_2', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
self.add_op = tensor_1 + tensor_2
self.session.run(tf.initializers.global_variables())
def experiment(self):
self.session.run(self.add_op)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2_v1/base.py Normal file

@@ -0,0 +1,43 @@
from pathlib import Path
import tensorflow.compat.v1 as tf
from src.base import BenchBase
from src.common import DataType, Device, Op, Platform
class TFBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
if data_type == DataType.FLOAT16:
dtype = tf.float16
elif data_type == DataType.FLOAT32:
dtype = tf.float32
elif data_type == DataType.FLOAT64:
dtype = tf.float64
else:
raise RuntimeError(f'data_type {data_type.value} not implemented')
super().__init__(output_path, Platform.TF2_V1, bench_op, Device.GPU, None, data_type, dtype)
self.session: tf.Session = None
def pre_experiment(self, _experiment_args):
tf.disable_v2_behavior()
# tf.disable_eager_execution()
# gpu_options = tf.GPUOptions(allow_growth=True)
# session_config = tf.ConfigProto(gpu_options=gpu_options)
# self.session = tf.Session(config=session_config)
self.session = tf.Session()
self.session.as_default()
def post_experiment(self):
self.session.close()
tf.reset_default_graph()
def experiment(self):
raise NotImplementedError()
def name(self, _experiment_args) -> str:
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
raise NotImplementedError()
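The TF1-compat pattern these benchmarks rely on, in miniature (a sketch, not project code): the graph is built once, then each experiment() is a session.run of the prebuilt op. It also shows why post_experiment() calls tf.reset_default_graph(): the next pre_experiment() re-creates variables under the same names ('tensor_1', 'tensor_2'), which would otherwise collide.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()
tensor_1 = tf.get_variable('tensor_1', shape=(2, 2), dtype=tf.float32,
                           initializer=tf.initializers.ones, trainable=False)
tensor_2 = tf.get_variable('tensor_2', shape=(2, 2), dtype=tf.float32,
                           initializer=tf.initializers.ones, trainable=False)
add_op = tensor_1 + tensor_2  # graph construction, done once per argument set
with tf.Session() as session:
    session.run(tf.initializers.global_variables())
    session.run(add_op)  # executing the prebuilt graph, done per timed iteration
tf.reset_default_graph()  # lets the variable names be reused next time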

src/tf_2_v1/div.py Normal file

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFDivBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV, data_type)
self.div_op = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
super().pre_experiment(experiment_args)
shape_1 = experiment_args
tensor_1 = tf.get_variable('tensor_1', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
tensor_2 = tf.get_variable('tensor_2', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
self.div_op = tensor_1 / tensor_2
self.session.run(tf.initializers.global_variables())
def experiment(self):
self.session.run(self.div_op)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2_v1/matmul.py Normal file

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFMatmulBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL, data_type)
self.matmul_op = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
super().pre_experiment(experiment_args)
shape_1, shape_2 = experiment_args
tensor_1 = tf.get_variable('tensor_1', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
tensor_2 = tf.get_variable('tensor_2', shape=shape_2, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
self.matmul_op = tensor_1 @ tensor_2
self.session.run(tf.initializers.global_variables())
def experiment(self):
self.session.run(self.matmul_op)
def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2_v1/mul.py Normal file

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFMulBench(TFBase):
def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL, data_type)
self.mul_op = None
def pre_experiment(self, experiment_args: Tuple[int, int]):
super().pre_experiment(experiment_args)
shape_1 = experiment_args
tensor_1 = tf.get_variable('tensor_1', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
tensor_2 = tf.get_variable('tensor_2', shape=shape_1, dtype=self.dtype,
initializer=tf.initializers.ones, trainable=False)
self.mul_op = tensor_1 * tensor_2
self.session.run(tf.initializers.global_variables())
def experiment(self):
self.session.run(self.mul_op)
def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
super().run(experiment_args, experiment_count)

src/tf_2_v1/ops.py Normal file

@@ -0,0 +1,16 @@
from typing import Type
from src.common import Op
from src.tf_2_v1.add import TFAddBench
from src.tf_2_v1.base import TFBase
from src.tf_2_v1.div import TFDivBench
from src.tf_2_v1.mul import TFMulBench
from src.tf_2_v1.matmul import TFMatmulBench
tf2v1_ops: dict[Op, Type[TFBase]] = {
Op.ADD: TFAddBench,
Op.MUL: TFMulBench,
Op.DIV: TFDivBench,
Op.MATMUL: TFMatmulBench
}