Jax implementation, code factorisation

* Compatibility for older python version (typing)
This commit is contained in:
Corentin 2021-10-01 20:14:00 +09:00
commit 16b7239cd7
37 changed files with 1007 additions and 293 deletions

View file

@ -2,28 +2,46 @@ from argparse import ArgumentParser
import multiprocessing as mp import multiprocessing as mp
import os import os
from pathlib import Path from pathlib import Path
from typing import Type import sys
from typing import List, Type
from config.benchmark import Config
from src.base import BenchBase from src.base import BenchBase
from src.common import DataType, Op, Platform from src.common import DataType, Op, Platform
from src.plot import compare
def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, bench_op: Op, def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, bench_op: Op,
bench_args, bench_count: int): bench_args, bench_count: int):
if platform == Platform.TF2: if platform == Platform.JAX:
from src.jax.ops import jax_ops
if bench_op not in jax_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
jax_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
elif platform == Platform.TF2:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from src.tf_2.ops import tf2_ops from src.tf_2.ops import tf2_ops
if bench_op not in tf2_ops: if bench_op not in tf2_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet') print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else: else:
tf2_ops[bench_op](output_path).run(bench_args, bench_count, data_type) tf2_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print()
elif platform == Platform.TF2_V1:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from src.tf_2_v1.ops import tf2v1_ops
if bench_op not in tf2v1_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else:
tf2v1_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print() print()
elif platform == Platform.TORCH: elif platform == Platform.TORCH:
from src.pytorch.ops import torch_ops from src.pytorch.ops import torch_ops
if bench_op not in torch_ops: if bench_op not in torch_ops:
print(f'Operation {bench_op.value} is not implemented for {platform.value} yet') print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
else: else:
torch_ops[bench_op](output_path).run(bench_args, bench_count, data_type) torch_ops[bench_op](output_path, data_type).run(bench_args, bench_count)
print() print()
else: else:
print(f'Platform {platform.value} is not implemented yet') print(f'Platform {platform.value} is not implemented yet')
@ -32,6 +50,8 @@ def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, be
def main(): def main():
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument('--output', type=Path, default=Path('output'), help='Path to output files') parser.add_argument('--output', type=Path, default=Path('output'), help='Path to output files')
parser.add_argument('--no-benchmark', action='store_true', default=False, help='Avoid running benchmarks')
parser.add_argument('--no-compare', action='store_true', default=False, help='Avoid running platform comparison')
parser.add_argument('--count', type=int, default=30, parser.add_argument('--count', type=int, default=30,
help='Number of experiments per benchmark (for statistical analysis)') help='Number of experiments per benchmark (for statistical analysis)')
parser.add_argument('--platform', nargs='*', type=Platform, parser.add_argument('--platform', nargs='*', type=Platform,
@ -39,63 +59,57 @@ def main():
parser.add_argument('--data', nargs='*', type=DataType, parser.add_argument('--data', nargs='*', type=DataType,
help='List of data type to benchmark [float16, float32, float64] (else all are used)') help='List of data type to benchmark [float16, float32, float64] (else all are used)')
parser.add_argument('--op', nargs='*', type=Op, parser.add_argument('--op', nargs='*', type=Op,
help='List of operation to benchmark [add, mul, div, matmul] (else all are used)') help='List of operation to benchmark (add, mul, div, matmul, etc) (else all are used)')
parser.add_argument('--list-op', action='store_true',
help='List all possible operation to benchmark (no further action will be done)')
parser.add_argument(
'--experiment-time', type=float,
help=f'Change time (in s) per experiment (default={Config.EXPERIMENT_TIME:0.3f}s)')
arguments = parser.parse_args() arguments = parser.parse_args()
if arguments.list_op:
print(', '.join([op.value for op in Op]))
sys.exit(0)
output_path: Path = arguments.output output_path: Path = arguments.output
no_benchmark: bool = arguments.no_benchmark
no_compare: bool = arguments.no_compare
bench_count: int = arguments.count bench_count: int = arguments.count
platforms: list[Platform] = arguments.platform if arguments.platform is not None else list(Platform) platforms: List[Platform] = arguments.platform if arguments.platform is not None else list(Platform)
data: list[DataType] = arguments.data if arguments.data is not None else list(DataType) data: List[DataType] = arguments.data if arguments.data is not None else list(DataType)
bench_ops: list[Op] = arguments.op if arguments.op is not None else list(Op) bench_ops: List[Op] = arguments.op if arguments.op is not None else list(Op)
if arguments.experiment_time:
Config.EXPERIMENT_TIME = arguments.experiment_time
if not output_path.exists(): if not output_path.exists():
output_path.mkdir(parents=True) output_path.mkdir(parents=True)
benchmarks: list[dict[Op, Type[BenchBase]]] = [] if not no_benchmark:
element_wise_args = [ benchmarks: List[dict[Op, Type[BenchBase]]] = []
(100, 100),
(100, 200),
(128, 128),
(200, 100),
(200, 200),
(256, 256),
(256, 512),
(512, 256),
(400, 400),
(512, 512),
(800, 800),
(1024, 1024),
(1800, 1800)]
matmul_args = [
((100, 100), (100, 100)),
((100, 200), (200, 100)),
((128, 128), (128, 128)),
((200, 100), (100, 200)),
((200, 200), (200, 200)),
((256, 256), (256, 256)),
((256, 512), (512, 256)),
((400, 400), (400, 400)),
((512, 256), (256, 512)),
((512, 512), (512, 512)),
((800, 800), (800, 800)),
((1000, 1000), (1000, 1000)),
((1200, 1200), (1200, 1200))]
for platform in platforms: for platform in platforms:
for data_type in data: for data_type in data:
for bench_op in [Op.ADD, Op.MUL, Op.DIV]: for bench_op in [Op.ADD, Op.MUL, Op.DIV]:
if bench_op in bench_ops: if bench_op in bench_ops:
benchmarks.append((output_path, platform, data_type, bench_op, element_wise_args, bench_count)) benchmarks.append((output_path, platform, data_type, bench_op,
if Op.MATMUL in bench_ops: Config.ELEMENT_WISE_ARGS, bench_count))
benchmarks.append((output_path, platform, data_type, Op.MATMUL, matmul_args, bench_count)) for bench_op in [Op.MATMUL, Op.NN_MATMUL]:
if bench_op in bench_ops:
benchmarks.append((output_path, platform, data_type, bench_op, Config.MATMUL_ARGS, bench_count))
if Op.NN_DENSE in bench_ops:
benchmarks.append((output_path, platform, data_type, Op.NN_DENSE, Config.NN_1D_ARGS, bench_count))
if benchmarks:
for benchmark in benchmarks: for benchmark in benchmarks:
process = mp.Process(target=run_benchmark, args=benchmark) process = mp.Process(target=run_benchmark, args=benchmark)
process.start() process.start()
process.join() process.join()
print('Benchmark done') print('Benchmark done')
if not no_compare:
compare(output_path)
print('Compare done')
if __name__ == '__main__': if __name__ == '__main__':
main() main()

41
config/benchmark.py Normal file
View file

@ -0,0 +1,41 @@
class Config:
    """Benchmark configuration shared by main.py and the per-platform benches."""

    # Target wall-clock time (in seconds) each experiment should run for;
    # overridable from the CLI via --experiment-time.
    EXPERIMENT_TIME = 1.0

    # (rows, cols) tensor shapes for the element-wise benches (add / mul / div).
    ELEMENT_WISE_ARGS = [
        (100, 100),
        (100, 200),
        (128, 128),
        (200, 100),
        (200, 200),
        (256, 256),
        (256, 512),
        (512, 256),
        (400, 400),
        (512, 512),
        (800, 800),
        (1024, 1024),
        (1800, 1800)]

    # ((m, k), (k, n)) shape pairs for the matmul benches.
    MATMUL_ARGS = [
        ((100, 100), (100, 100)),
        ((100, 200), (200, 100)),
        ((128, 128), (128, 128)),
        ((200, 100), (100, 200)),
        ((200, 200), (200, 200)),
        ((256, 256), (256, 256)),
        ((256, 512), (512, 256)),
        ((400, 400), (400, 400)),
        ((512, 256), (256, 512)),
        ((512, 512), (512, 512)),
        ((800, 800), (800, 800)),
        ((1000, 1000), (1000, 1000)),
        ((1200, 1200), (1200, 1200))]

    # (batch_size, dimension) input sizes for the single Dense-layer bench.
    NN_1D_ARGS = [
        (1, 16), (16, 16), (64, 16),
        (1, 64), (16, 64),
        (1, 150), (16, 150),
        (1, 256), (16, 256),
        (1, 400), (16, 400), (64, 400),
        (1, 512), (16, 512), (64, 512),
        (1, 800), (16, 800), (64, 800),
        (1, 1024), (16, 1024),
        (1, 2000), (16, 2000), (64, 2000),
        (1, 4000), (16, 4000), (64, 4000)]

8
requirements.txt Normal file
View file

@ -0,0 +1,8 @@
jax
jaxlib
matplotlib
numpy
pandas
seaborn
tensorflow
torch

View file

@ -1,69 +1,71 @@
from pathlib import Path from pathlib import Path
import time import time
from typing import List
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from config.benchmark import Config
from src.common import DataKey, DataType, Device, Op, Platform from src.common import DataKey, DataType, Device, Op, Platform
from src.plot import plot_experiments from src.op_info import op_infos
from src.utils import get_cpu_name, get_nvidia_name from src.utils import get_cpu_name, get_nvidia_name
class BenchBase(): class BenchBase():
def __init__(self, output_path: Path, platform: Platform, bench_op: Op, device_type: Device, device): def __init__(self, output_path: Path, platform: Platform, bench_op: Op,
device_type: Device, device,
data_type: DataType, dtype):
self._base_output_path = output_path self._base_output_path = output_path
self.output_path = output_path
self.platform = platform self.platform = platform
self.bench_op = bench_op self.bench_op = bench_op
self.device_type = device_type self.device_type = device_type
self.device = device self.device = device
self.dtype = None self.device_name = get_cpu_name() if self.device_type == Device.CPU else get_nvidia_name()
self.data_type = data_type
self.dtype = dtype
self.info = op_infos[bench_op]
def set_output_path(self, device: Device, device_name: str):
self.output_path = ( self.output_path = (
self._base_output_path / f'{device.value}_{device_name}' / self.platform.value / self.bench_op.value) self._base_output_path / f'{self.device_type.value}_{self.device_name}'
/ self.platform.value / self.bench_op.value) # noqa
def get_dtype(self, data_type: DataType): def pre_experiment(self, _experiment_args):
pass
def experiment(self):
raise NotImplementedError() raise NotImplementedError()
def experiment(self, _experiment_args, _length, _dtype, _device): def post_experiment(self):
raise NotImplementedError() pass
def name(self, _experiment_args) -> str:
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
raise NotImplementedError()
def run(self, experiment_args, experiment_count: int, data_type: DataType):
self.set_output_path(self.device_type, get_cpu_name() if self.device_type == Device.CPU else get_nvidia_name())
def run(self, experiment_args, experiment_count: int):
if not self.output_path.exists(): if not self.output_path.exists():
self.output_path.mkdir(parents=True) self.output_path.mkdir(parents=True)
dtype = self.get_dtype(data_type) print(f'Starting {self.platform.value}\'s {self.bench_op.value} benchmark'
f' with data type: {self.data_type.value}')
print(f'Starting {self.platform.value}\'s {self.bench_op.value} benchmark with data type: {data_type.value}')
experiment_names = [] experiment_names = []
experiment_lengths = [] experiment_lengths = []
experiment_times = [] experiment_times = []
experiment_mop = [] experiment_mop = []
for args in experiment_args: for args in experiment_args:
self.pre_experiment(args)
# warmup # warmup
for _ in range(4): for _ in range(20):
self.experiment(args, 5, dtype, self.device) self.experiment()
# speed evaluation # speed evaluation
counter = 0 counter = 0
start_time = time.time() start_time = time.time()
while time.time() - start_time < 0.2: while (time.time() - start_time) < (Config.EXPERIMENT_TIME / 5):
self.experiment(args, 10, dtype, self.device) self.experiment()
counter += 10 counter += 1
end_time = time.time() end_time = time.time()
target_time = 1.0 # in s target_time = Config.EXPERIMENT_TIME # in s
experiment_speed = counter / (end_time - start_time) # in op/s experiment_speed = counter / (end_time - start_time) # in op/s
experiment_length = max(int(target_time / experiment_count * experiment_speed), 2) experiment_length = max(int(target_time / experiment_count * experiment_speed), 2)
# print(f'Evaluated {counter} {self.bench_op.value} in {end_time - start_time:0.3f}s' # print(f'Evaluated {counter} {self.bench_op.value} in {end_time - start_time:0.3f}s'
@ -73,24 +75,28 @@ class BenchBase():
run_times = [] run_times = []
for _ in range(experiment_count): for _ in range(experiment_count):
start_time = time.time() start_time = time.time()
self.experiment(args, experiment_length, dtype, self.device) for _ in range(experiment_length):
self.experiment()
run_times.append(time.time() - start_time) run_times.append(time.time() - start_time)
experiment_times += run_times experiment_times += run_times
experiment_names += [self.name(args)] * experiment_count experiment_names += [self.info.name(args)] * experiment_count
experiment_lengths += [experiment_length] * experiment_count experiment_lengths += [experiment_length] * experiment_count
experiment_mop += [self.mop(args)] * experiment_count experiment_mop += [self.info.mop(args)] * experiment_count
total_time = np.array(run_times, dtype=np.float64).sum() total_time = np.array(run_times, dtype=np.float64).sum()
total_glop = self.mop(args) * experiment_length * experiment_count / 1000 total_glop = self.info.mop(args) * experiment_length * experiment_count / 1000
print(f'Run {experiment_names[-1]} (x{experiment_length})' print(f'Run {experiment_names[-1]} (x{experiment_length})'
f' in {total_time:0.2f}s => {total_glop / total_time:0.3f}GFOPS') f' in {total_time:0.2f}s => {total_glop / total_time:0.3f}GFOPS')
self.post_experiment()
data = self.save_experiments(experiment_names, experiment_times, experiment_lengths, experiment_mop, data_type) data = self.save_experiments(experiment_names, experiment_times, experiment_lengths, experiment_mop)
plot_experiments(self.output_path, data, data_type, self.bench_op, self.platform) # Avoid circular import
from src.plot import plot_experiments # pylint: disable=import-outside-toplevel
plot_experiments(self, data)
def save_experiments( def save_experiments(
self, experiment_names: list[str], experiment_times: list[float], self, experiment_names: List[str], experiment_times: List[float],
experiment_lengths: list[int], experiment_mop: list[float], data_type: DataType) -> pd.DataFrame: experiment_lengths: List[int], experiment_mop: List[float]) -> pd.DataFrame:
key = DataKey(self.bench_op) key = DataKey(self.bench_op)
data = pd.DataFrame( data = pd.DataFrame(
{ {
@ -102,5 +108,5 @@ class BenchBase():
key.gflops: [(mop * l) / (t * 1000.0) key.gflops: [(mop * l) / (t * 1000.0)
for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)] for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)]
}) })
data.to_csv(self.output_path / f'{self.bench_op.value}_{data_type.value}.csv', sep='\t') data.to_csv(self.output_path / f'{self.bench_op.value}_{self.data_type.value}.csv', sep='\t')
return data return data

View file

@ -13,16 +13,19 @@ class DataType(Enum):
class Op(Enum): class Op(Enum):
NO_OP = 'noop'
ADD = 'add' ADD = 'add'
DIV = 'div' DIV = 'div'
MUL = 'mul' MUL = 'mul'
MATMUL = 'matmul' MATMUL = 'matmul'
NN_MATMUL = 'nn_matmul'
NN_DENSE = 'nn_dense'
class Platform(Enum): class Platform(Enum):
TF1 = 'TF1' JAX = 'jax'
# TF1 = 'TF1'
TF2 = 'TF2' TF2 = 'TF2'
TF2_V1 = 'TF2_V1'
TORCH = 'Torch' TORCH = 'Torch'

25
src/jax/add.py Normal file
View file

@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxAddBench(JaxBase):
    """Element-wise addition benchmark on the JAX platform."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.ADD, data_type)
        # NOTE(review): annotations use jnp.ndarray — jnp.DeviceArray was
        # removed in newer JAX releases.
        self.tensor_1: jnp.ndarray = None
        self.tensor_2: jnp.ndarray = None
        self.tensor_result: jnp.ndarray = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Place both operands on the device and run one warm-up add."""
        shape_1 = experiment_args
        self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        self.tensor_2 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        # block_until_ready() forces JAX's async dispatch to finish so the
        # caller's timing covers the actual computation.
        self.tensor_result = jnp.add(self.tensor_1, self.tensor_2).block_until_ready()

    def experiment(self):
        # One timed iteration: the add plus device synchronization.
        self.tensor_result = jnp.add(self.tensor_1, self.tensor_2).block_until_ready()

34
src/jax/base.py Normal file
View file

@ -0,0 +1,34 @@
from pathlib import Path
import jax.numpy as jnp
import jax
from src.base import BenchBase
from src.common import DataType, Device, Op, Platform
class JaxBase(BenchBase):
    """Common setup for all JAX benchmarks: picks the device and resolves
    the jnp dtype, then defers to BenchBase for timing/plotting.
    """

    # Maps project data types to their jnp equivalents.
    _DTYPES = {
        DataType.FLOAT16: jnp.float16,
        DataType.FLOAT32: jnp.float32,
        DataType.FLOAT64: jnp.float64,
    }

    def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
        # Newer JAX raises RuntimeError from jax.devices() when the requested
        # backend is unavailable instead of returning an empty list; treat
        # both cases as "no GPU" and fall back to CPU.
        try:
            gpu_devices = jax.devices('gpu')
        except RuntimeError:
            gpu_devices = []
        if gpu_devices:
            if len(gpu_devices) > 1:
                # Fixed typo in the original message ("WARINING").
                print('WARNING : no multiple CUDA device benchmark implemented yet (only using first)')
            device_type = Device.GPU
            device = gpu_devices[0]
        else:
            device_type = Device.CPU
            device = jax.devices('cpu')[0]
        dtype = self._DTYPES.get(data_type)
        if dtype is None:
            raise NotImplementedError(f'data_type {data_type.value} not implemented')
        super().__init__(output_path, Platform.JAX, bench_op, device_type, device, data_type, dtype)

    def experiment(self):
        """One timed iteration; concrete benches must override."""
        raise NotImplementedError()

25
src/jax/div.py Normal file
View file

@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxDivBench(JaxBase):
    """Element-wise division benchmark on the JAX platform."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.DIV, data_type)
        # NOTE(review): annotations use jnp.ndarray — jnp.DeviceArray was
        # removed in newer JAX releases.
        self.tensor_1: jnp.ndarray = None
        self.tensor_2: jnp.ndarray = None
        self.tensor_result: jnp.ndarray = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Place both operands on the device and run one warm-up divide."""
        shape_1 = experiment_args
        self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        self.tensor_2 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        # block_until_ready() forces JAX's async dispatch to finish so the
        # caller's timing covers the actual computation.
        self.tensor_result = jnp.divide(self.tensor_1, self.tensor_2).block_until_ready()

    def experiment(self):
        # One timed iteration: the divide plus device synchronization.
        self.tensor_result = jnp.divide(self.tensor_1, self.tensor_2).block_until_ready()

28
src/jax/matmul.py Normal file
View file

@ -0,0 +1,28 @@
from pathlib import Path
from typing import List, Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxMatmulBench(JaxBase):
    """Raw (un-jitted) jnp.matmul benchmark; compare with JaxNNMatmulBench."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.MATMUL, data_type)
        # NOTE(review): annotations use jnp.ndarray — jnp.DeviceArray was
        # removed in newer JAX releases.
        self.tensor_1: jnp.ndarray = None
        self.tensor_2: jnp.ndarray = None
        self.tensor_result: jnp.ndarray = None

    def pre_experiment(self, experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]):
        """Place both operands on the device and run one warm-up product.

        ``experiment_args`` is an ((m, k), (k, n)) shape pair (see
        Config.MATMUL_ARGS); the original annotation (Tuple[int, int]) was wrong.
        """
        shape_1, shape_2 = experiment_args
        self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        self.tensor_2 = device_put(jnp.ones(shape_2, dtype=self.dtype))
        # block_until_ready() forces JAX's async dispatch to finish so the
        # caller's timing covers the actual computation.
        self.tensor_result = jnp.matmul(self.tensor_1, self.tensor_2).block_until_ready()

    def experiment(self):
        # One timed iteration: the matmul plus device synchronization.
        self.tensor_result = jnp.matmul(self.tensor_1, self.tensor_2).block_until_ready()

    def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
        """Typed wrapper over BenchBase.run; args are ((m, k), (k, n)) pairs."""
        super().run(experiment_args, experiment_count)

25
src/jax/mul.py Normal file
View file

@ -0,0 +1,25 @@
from pathlib import Path
from typing import Tuple
from jax import device_put
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxMulBench(JaxBase):
    """Element-wise multiplication benchmark on the JAX platform."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.MUL, data_type)
        self.tensor_1: jnp.DeviceArray = None
        self.tensor_2: jnp.DeviceArray = None
        self.tensor_result: jnp.DeviceArray = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Place both operands on the device and run one warm-up multiply."""
        # Element-wise op: both operands share the same shape.
        shape = experiment_args
        self.tensor_1, self.tensor_2 = (
            device_put(jnp.ones(shape, dtype=self.dtype)) for _ in range(2))
        self.tensor_result = jnp.multiply(self.tensor_1, self.tensor_2).block_until_ready()

    def experiment(self):
        """One timed multiply, synchronized so the caller's timing is meaningful."""
        product = jnp.multiply(self.tensor_1, self.tensor_2)
        self.tensor_result = product.block_until_ready()

32
src/jax/nn_dense.py Normal file
View file

@ -0,0 +1,32 @@
from pathlib import Path
from typing import Callable, List, Tuple
from jax import device_put, jit, random
from jax.experimental import stax
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
class JaxNNDenseBench(JaxBase):
    """Benchmark of a single jit-compiled Dense (fully-connected) layer."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_DENSE, data_type)
        # NOTE(review): annotations use jnp.ndarray — jnp.DeviceArray was
        # removed in newer JAX releases.
        self.tensor: jnp.ndarray = None         # layer input, shape (batch, dim)
        self.tensor_result: jnp.ndarray = None  # layer output
        self.network: Callable = None           # jit-compiled apply function
        self.params = None                      # layer parameters from stax init

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the input, init the Dense layer, jit it, run one warm-up pass."""
        batch_size, dimension = experiment_args
        self.tensor = device_put(jnp.ones((batch_size, dimension), dtype=self.dtype))
        # NOTE(review): jax.experimental.stax moved to
        # jax.example_libraries.stax in newer JAX — confirm the pinned version.
        network_init, self.network = stax.Dense(dimension)
        _, self.params = network_init(random.PRNGKey(1), (batch_size, dimension))
        self.network = jit(self.network)
        # Warm-up call triggers jit compilation so timed runs measure execution only.
        self.tensor_result = self.network(self.params, self.tensor)

    def experiment(self):
        # NOTE(review): unlike the element-wise benches there is no
        # block_until_ready() here; async dispatch may make timings
        # optimistic — confirm this is intended.
        self.tensor_result = self.network(self.params, self.tensor)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        """Typed wrapper over BenchBase.run; args are (batch, dim) pairs."""
        super().run(experiment_args, experiment_count)

33
src/jax/nn_matmul.py Normal file
View file

@ -0,0 +1,33 @@
from pathlib import Path
from typing import List, Tuple
from jax import device_put, jit
import jax.numpy as jnp
from src.common import DataType, Op
from src.jax.base import JaxBase
def matmul(tensor_1: jnp.ndarray, tensor_2: jnp.ndarray) -> jnp.ndarray:
    """Return the matrix product ``tensor_1 @ tensor_2``.

    Defined at module level so JaxNNMatmulBench can jit-compile it once.
    The annotations use ``jnp.ndarray`` instead of the original
    ``jnp.DeviceArray``: function annotations are evaluated at def time and
    ``DeviceArray`` was removed from newer JAX releases, which would make the
    module fail to import.
    """
    return tensor_1 @ tensor_2
class JaxNNMatmulBench(JaxBase):
    """Benchmark of the jit-compiled matmul wrapper (contrast with JaxMatmulBench)."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_MATMUL, data_type)
        # NOTE(review): annotations use jnp.ndarray — jnp.DeviceArray was
        # removed in newer JAX releases.
        self.tensor_1: jnp.ndarray = None
        self.tensor_2: jnp.ndarray = None
        self.tensor_result: jnp.ndarray = None
        # jit once at construction; compilation happens on the first call.
        self.network = jit(matmul)

    def pre_experiment(self, experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]):
        """Allocate operands and run one warm-up call (triggers jit compile).

        ``experiment_args`` is an ((m, k), (k, n)) shape pair (main.py feeds
        Config.MATMUL_ARGS to NN_MATMUL); the original Tuple[int, int]
        annotation was wrong.
        """
        shape_1, shape_2 = experiment_args
        self.tensor_1 = device_put(jnp.ones(shape_1, dtype=self.dtype))
        self.tensor_2 = device_put(jnp.ones(shape_2, dtype=self.dtype))
        self.tensor_result = self.network(self.tensor_1, self.tensor_2)

    def experiment(self):
        # NOTE(review): no block_until_ready() — async dispatch may return
        # before the product is computed; confirm timings stay comparable
        # with JaxMatmulBench.
        self.tensor_result = self.network(self.tensor_1, self.tensor_2)

    def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
        """Typed wrapper over BenchBase.run; args are ((m, k), (k, n)) pairs."""
        super().run(experiment_args, experiment_count)

20
src/jax/ops.py Normal file
View file

@ -0,0 +1,20 @@
from typing import Dict, Type

from src.common import Op
from src.jax.add import JaxAddBench
from src.jax.base import JaxBase
from src.jax.div import JaxDivBench
from src.jax.matmul import JaxMatmulBench
from src.jax.mul import JaxMulBench
from src.jax.nn_dense import JaxNNDenseBench
from src.jax.nn_matmul import JaxNNMatmulBench
# Registry of the JAX benchmark implementations, keyed by operation.
# Uses typing.Dict instead of the Python 3.9+ ``dict[...]`` subscript: this
# module-level annotation is evaluated at import time, so the builtin-generic
# form breaks on older interpreters — which this commit explicitly targets
# ("Compatibility for older python version (typing)") and which every other
# module (op_info.py, main.py) already follows with List/Dict.
jax_ops: Dict[Op, Type[JaxBase]] = {
    Op.ADD: JaxAddBench,
    Op.MUL: JaxMulBench,
    Op.DIV: JaxDivBench,
    Op.MATMUL: JaxMatmulBench,
    Op.NN_MATMUL: JaxNNMatmulBench,
    Op.NN_DENSE: JaxNNDenseBench
}

85
src/op_info.py Normal file
View file

@ -0,0 +1,85 @@
from typing import Dict, List, Type, Tuple
from src.common import Op
class _BaseInfo():
    """Interface for per-op metadata: a display label and a mega-op count.

    Concrete subclasses override both static methods; instances are never
    created (the op_infos registry maps Op values to the classes themselves).
    """

    @staticmethod
    def name(experiment_args) -> str:
        """Human-readable label for one experiment configuration."""
        raise NotImplementedError()

    @staticmethod
    def mop(experiment_args) -> float:
        """Mega-operations performed by a single run of the op."""
        raise NotImplementedError()
class AddInfo(_BaseInfo):
    """Label and mega-op count for the element-wise add benchmark."""

    @staticmethod
    def name(experiment_args: Tuple[int, int]) -> str:
        """E.g. (100, 200) -> '100x200 + 100x200'."""
        rows, cols = experiment_args
        return f'{rows}x{cols} + {rows}x{cols}'

    @staticmethod
    def mop(experiment_args: Tuple[int, int]) -> float:
        """One addition per output element, expressed in mega-ops."""
        rows, cols = experiment_args
        return rows * cols / 1_000_000
class DivInfo(_BaseInfo):
    """Label and mega-op count for the element-wise divide benchmark."""

    @staticmethod
    def name(experiment_args: Tuple[int, int]) -> str:
        """E.g. (100, 200) -> '100x200 / 100x200'."""
        rows, cols = experiment_args
        return f'{rows}x{cols} / {rows}x{cols}'

    @staticmethod
    def mop(experiment_args: Tuple[int, int]) -> float:
        """One division per output element, expressed in mega-ops."""
        rows, cols = experiment_args
        return rows * cols / 1_000_000
class MulInfo(_BaseInfo):
    """Label and mega-op count for the element-wise multiply benchmark."""

    @staticmethod
    def name(experiment_args: Tuple[int, int]) -> str:
        """E.g. (100, 200) -> '100x200 * 100x200'."""
        rows, cols = experiment_args
        return f'{rows}x{cols} * {rows}x{cols}'

    @staticmethod
    def mop(experiment_args: Tuple[int, int]) -> float:
        """One multiplication per output element, expressed in mega-ops."""
        rows, cols = experiment_args
        return rows * cols / 1_000_000
class MatmulInfo(_BaseInfo):
    """Label and mega-op count for an (m, k) @ (k, n) matrix product.

    Shared by Op.MATMUL and Op.NN_MATMUL (see op_infos), which compute the
    same product. The annotations are fixed here: ``experiment_args`` is a
    single ((m, k), (k, n)) shape pair, not a List of them as the original
    annotation claimed.
    """

    @staticmethod
    def name(experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]) -> str:
        """E.g. ((100, 200), (200, 100)) -> '100x200 @ 200x100'."""
        shape_1, shape_2 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'

    @staticmethod
    def mop(experiment_args: Tuple[Tuple[int, int], Tuple[int, int]]) -> float:
        """Mega-ops: m*n output elements, 2*(k-1) ops counted per element.

        NOTE(review): the textbook count is m*n*(2k-1) (k multiplies,
        k-1 adds); this uses 2*(k-1) — confirm the convention is intended.
        Kept unchanged so results stay comparable with existing CSVs.
        """
        shape_1, shape_2 = experiment_args
        return (shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)
class DenseInfo(_BaseInfo):
    """Label and mega-op count for a Dense layer on a (batch, dim) input."""

    @staticmethod
    def name(experiment_args: Tuple[int, int]) -> str:
        """E.g. (16, 512) -> 'Dense((16x512))'."""
        batch_size, dimension = experiment_args
        return f'Dense(({batch_size}x{dimension}))'

    @staticmethod
    def mop(experiment_args: Tuple[int, int]) -> float:
        """Mega-ops of ``x @ W + b`` with x: (batch, dim) and W: (dim, dim).

        Matches MatmulInfo's convention for the (batch, dim) @ (dim, dim)
        product — batch*dim output elements at 2*(dim-1) ops each — plus one
        add per output element for the bias.

        Bug fix: the original counted dimension*dimension output elements for
        the matmul term and then multiplied by batch_size again, inflating
        the result by a factor of ``dimension`` relative to MatmulInfo.
        """
        batch_size, dimension = experiment_args
        matmul_mop = (batch_size * dimension / 1_000_000) * 2 * (dimension - 1)
        bias_mop = batch_size * dimension / 1_000_000
        return matmul_mop + bias_mop
# Registry mapping each benchmarked Op to its metadata class (display label
# and mega-op count). MATMUL and NN_MATMUL share MatmulInfo because they
# perform the same matrix product.
op_infos: Dict[Op, Type[_BaseInfo]] = {
    Op.ADD: AddInfo,
    Op.DIV: DivInfo,
    Op.MUL: MulInfo,
    Op.MATMUL: MatmulInfo,
    Op.NN_MATMUL: MatmulInfo,
    Op.NN_DENSE: DenseInfo
}

View file

@ -1,4 +1,7 @@
from pathlib import Path from pathlib import Path
import math
import multiprocessing as mp
import os
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -6,11 +9,20 @@ import pandas as pd
import seaborn as sns import seaborn as sns
from src.base import BenchBase
from src.common import DataKey, DataType, Op, Platform from src.common import DataKey, DataType, Op, Platform
def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType, bench_op: Op, platform: Platform): class CompKey:
key = DataKey(bench_op) def __init__(self):
self.data_type = 'data_type'
self.device = 'device'
self.bench_op = 'op'
self.platform = 'platform'
def plot_experiments(bench: BenchBase, data: pd.DataFrame):
key = DataKey(bench.bench_op)
sum_data = data[[key.experiment, key.time, key.count]].groupby( sum_data = data[[key.experiment, key.time, key.count]].groupby(
key.experiment, as_index=False, sort=False).sum() key.experiment, as_index=False, sort=False).sum()
mean_data = data[[key.experiment, key.speed]].groupby( mean_data = data[[key.experiment, key.speed]].groupby(
@ -20,11 +32,11 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
sns.set_theme(style="ticks") sns.set_theme(style="ticks")
figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12)) figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12))
figure.suptitle(f'{platform.value} {bench_op.value} ({data_type.value})', fontsize=16)
for axe in axes[:-1]: for axe in axes[:-1]:
axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
chart = sns.barplot(x=key.experiment, y=key.mop, data=max_data, ax=axes[0], order=data[key.experiment].unique()) chart = sns.barplot(x=key.experiment, y=key.mop, data=max_data, ax=axes[0], order=data[key.experiment].unique())
if max_data[key.mop].max() > max_data[key.mop].min() * 100:
axes[0].set_yscale("log") axes[0].set_yscale("log")
for patch, value in zip(chart.patches, max_data[key.mop]): for patch, value in zip(chart.patches, max_data[key.mop]):
chart.annotate(f'{value:0.3f}', chart.annotate(f'{value:0.3f}',
@ -33,6 +45,8 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points') textcoords='offset points')
chart = sns.barplot(x=key.experiment, y=key.speed, data=data, estimator=np.median, ax=axes[1]) chart = sns.barplot(x=key.experiment, y=key.speed, data=data, estimator=np.median, ax=axes[1])
if data[key.speed].max() > data[key.speed].min() * 100:
axes[1].set_yscale("log")
for patch, value in zip(chart.patches, mean_data[key.speed]): for patch, value in zip(chart.patches, mean_data[key.speed]):
chart.annotate(f'{value:.3f}', chart.annotate(f'{value:.3f}',
(patch.get_x() + patch.get_width() / 2.0, patch.get_height()), (patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
@ -40,6 +54,8 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points') textcoords='offset points')
chart = sns.barplot(x=key.experiment, y=key.gflops, data=data, estimator=np.median, ax=axes[2]) chart = sns.barplot(x=key.experiment, y=key.gflops, data=data, estimator=np.median, ax=axes[2])
if data[key.gflops].max() > data[key.gflops].min() * 100:
axes[2].set_yscale("log")
for patch, mop, count, value in zip(chart.patches, max_data[key.mop], sum_data[key.count], sum_data[key.time]): for patch, mop, count, value in zip(chart.patches, max_data[key.mop], sum_data[key.count], sum_data[key.time]):
chart.annotate(f'{(mop * count / 1000) / value:.3f}', chart.annotate(f'{(mop * count / 1000) / value:.3f}',
(patch.get_x() + patch.get_width() / 2.0, patch.get_height()), (patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
@ -47,5 +63,62 @@ def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType,
textcoords='offset points') textcoords='offset points')
plt.xticks(rotation=20) plt.xticks(rotation=20)
plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.93, right=0.99, bottom=0.1, left=0.05) plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.91, right=0.99, bottom=0.1, left=0.05)
plt.savefig(output_path / f'{bench_op.value}_{data_type.value}.png') figure.suptitle(f'{bench.platform.value} {bench.bench_op.value} ({bench.data_type.value})', fontsize=16)
axes[0].set_title(f'{bench.device_name}', fontsize=12)
plt.savefig(bench.output_path / f'{bench.bench_op.value}_{bench.data_type.value}.png')
def _draw_comparison(all_data: pd.DataFrame, comp_key: CompKey, device: str, bench_op: str, output_path: Path):
op_data = all_data[(all_data[comp_key.bench_op] == bench_op) & (all_data[comp_key.device] == device)]
platform_list = op_data[comp_key.platform].unique()
if len(platform_list) <= 1:
return
key = DataKey(Op(bench_op))
sns.set_theme(style="ticks")
for data_type in op_data[comp_key.data_type].unique():
data = op_data[op_data[comp_key.data_type] == data_type]
graph = sns.catplot(x=key.experiment, y=key.gflops, hue=comp_key.platform, data=data,
kind='bar', estimator=np.median, height=8, aspect=1.4)
if data[key.gflops].max() > data[key.gflops].min() * 100:
graph.set(yscale="log")
plt.xticks(rotation=70, fontsize=8)
plt.subplots_adjust(top=0.92, bottom=0.25)
plt.suptitle('/'.join(platform_list) + f' {bench_op} ({data_type})', fontsize=16)
plt.title(f'{device}', fontsize=12)
plt.savefig(output_path / device / f'{bench_op}_{data_type}.png')
def compare(output_path: Path):
all_data: pd.DataFrame = None
comp_key = CompKey()
for data_path in output_path.rglob('*.csv'):
if len(data_path.parents) <= 4:
print(f'Warning: cannot parse data at path {data_path} (subfolders missing)')
data_type = DataType(data_path.stem.split('_')[-1])
bench_op = Op(data_path.parents[0].name)
platform = Platform(data_path.parents[1].name)
device_name = data_path.parents[2].name
current_data = pd.read_csv(data_path, sep='\t')
current_data[comp_key.data_type] = data_type.value
current_data[comp_key.bench_op] = bench_op.value
current_data[comp_key.platform] = platform.value
current_data[comp_key.device] = device_name
if all_data is None:
all_data = current_data
else:
all_data = all_data.append(current_data, ignore_index=True, verify_integrity=True)
# Compare between platforms
comp_args = []
for device in all_data[comp_key.device].unique():
for bench_op in all_data[comp_key.bench_op].unique():
comp_args.append((all_data, comp_key, device, bench_op, output_path))
with mp.Pool(processes=math.ceil(os.cpu_count() * 0.8)) as pool:
pool.starmap(_draw_comparison, comp_args)

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import torch import torch
@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchAddBench(TorchBase): class TorchAddBench(TorchBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD) super().__init__(output_path, Op.ADD, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 + self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 + tensor_2 self.tensor_result = self.tensor_1 + self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -7,7 +7,7 @@ from src.common import DataType, Device, Op, Platform
class TorchBase(BenchBase): class TorchBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op): def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
if torch.cuda.is_available(): if torch.cuda.is_available():
if torch.cuda.device_count() > 1: if torch.cuda.device_count() > 1:
print('WARINING : no multiple CUDA device benchmark implemented yet (only using first)') print('WARINING : no multiple CUDA device benchmark implemented yet (only using first)')
@ -18,22 +18,16 @@ class TorchBase(BenchBase):
device_type = Device.CPU device_type = Device.CPU
device = torch.device('cpu') device = torch.device('cpu')
super().__init__(output_path, Platform.TORCH, bench_op, device_type, device)
def get_dtype(self, data_type: DataType) -> torch.dtype:
if data_type == DataType.FLOAT16: if data_type == DataType.FLOAT16:
return torch.float16 dtype = torch.float16
if data_type == DataType.FLOAT32: elif data_type == DataType.FLOAT32:
return torch.float32 dtype = torch.float32
if data_type == DataType.FLOAT64: elif data_type == DataType.FLOAT64:
return torch.float64 dtype = torch.float64
else:
raise NotImplementedError(f'data_type {data_type.value} not implemented') raise NotImplementedError(f'data_type {data_type.value} not implemented')
def experiment(self, _experiment_args, _length, _dtype, _device): super().__init__(output_path, Platform.TORCH, bench_op, device_type, device, data_type, dtype)
raise NotImplementedError()
def name(self, _experiment_args) -> str: def experiment(self):
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
raise NotImplementedError() raise NotImplementedError()

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import torch import torch
@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchDivBench(TorchBase): class TorchDivBench(TorchBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV) super().__init__(output_path, Op.DIV, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 / self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 / tensor_2 self.tensor_result = self.tensor_1 / self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import torch import torch
@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchMatmulBench(TorchBase): class TorchMatmulBench(TorchBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL) super().__init__(output_path, Op.MATMUL, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args shape_1, shape_2 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
tensor_2 = torch.ones(shape_2, dtype=dtype, device=device, requires_grad=False) self.tensor_2 = torch.ones(shape_2, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 @ self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 @ tensor_2 self.tensor_result = self.tensor_1 @ self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
shape_1, shape_2 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1, shape_2 = experiment_args
return (shape_1[0] * shape_2[1] / 1000_000) * 2 * (shape_1[1] - 1)
def run(self,
experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import torch import torch
@ -7,27 +8,20 @@ from src.pytorch.base import TorchBase
class TorchMulBench(TorchBase): class TorchMulBench(TorchBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL) super().__init__(output_path, Op.MUL, data_type)
self.tensor_1: torch.Tensor = None
self.tensor_2: torch.Tensor = None
self.tensor_result: torch.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_1 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False) self.tensor_2 = torch.ones(shape_1, dtype=self.dtype, device=self.device, requires_grad=False)
self.tensor_result = self.tensor_1 * self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 * tensor_2 self.tensor_result = self.tensor_1 * self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

36
src/pytorch/nn_dense.py Normal file
View file

@ -0,0 +1,36 @@
from pathlib import Path
from typing import List, Tuple
import torch
from src.common import DataType, Op
from src.pytorch.base import TorchBase
class DenseNetwork(torch.nn.Module):
    """A single fully-connected layer mapping ``input_dim`` features back to ``input_dim``."""

    def __init__(self, input_dim: int, dtype: torch.dtype):
        super().__init__()
        # Square layer: the benchmark keeps input and output width identical.
        self.dense = torch.nn.Linear(in_features=input_dim, out_features=input_dim, dtype=dtype)

    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
        """Apply the dense layer to ``input_data``."""
        return self.dense(input_data)
class TorchNNDenseBench(TorchBase):
    """Benchmark of a single dense (fully connected) layer forward pass in PyTorch."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_DENSE, data_type)
        # Populated by pre_experiment so that experiment() needs no arguments.
        self.tensor: torch.Tensor = None
        self.tensor_result: torch.Tensor = None
        self.network: torch.nn.Module = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the input batch and the network, then run one warm-up forward pass."""
        batch_size, dimension = experiment_args
        input_shape = (batch_size, dimension)
        self.tensor = torch.ones(input_shape, dtype=self.dtype, device=self.device, requires_grad=False)
        self.network = DenseNetwork(dimension, self.dtype).to(self.device)
        # Warm-up call so timed runs do not include one-time initialisation cost.
        self.tensor_result = self.network(self.tensor)

    def experiment(self):
        """One timed forward pass through the dense network."""
        self.tensor_result = self.network(self.tensor)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        super().run(experiment_args, experiment_count)

34
src/pytorch/nn_matmul.py Normal file
View file

@ -0,0 +1,34 @@
from pathlib import Path
from typing import List, Tuple
import torch
from src.common import DataType, Op
from src.pytorch.base import TorchBase
class MatMulNetwork(torch.nn.Module):
    """Parameter-less module that returns the matrix product of its two inputs."""

    def forward(self, input_1: torch.Tensor, input_2: torch.Tensor) -> torch.Tensor:
        # Same operation as ``input_1 @ input_2``, spelled via the explicit API.
        return torch.matmul(input_1, input_2)
class TorchNNMatmulBench(TorchBase):
    """Benchmark of a matrix product executed through an nn.Module call in PyTorch."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_MATMUL, data_type)
        # Populated by pre_experiment so that experiment() needs no arguments.
        self.tensor_1: torch.Tensor = None
        self.tensor_2: torch.Tensor = None
        self.tensor_result: torch.Tensor = None
        self.network: torch.nn.Module = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Allocate both operands, build the module, and run one warm-up pass."""
        shape_left, shape_right = experiment_args

        def ones(shape):
            # All operands share dtype/device and never need gradients.
            return torch.ones(shape, dtype=self.dtype, device=self.device, requires_grad=False)

        self.tensor_1 = ones(shape_left)
        self.tensor_2 = ones(shape_right)
        self.network = MatMulNetwork()
        # Warm-up call so timed runs do not include one-time initialisation cost.
        self.tensor_result = self.network(self.tensor_1, self.tensor_2)

    def experiment(self):
        """One timed matrix product through the module."""
        self.tensor_result = self.network(self.tensor_1, self.tensor_2)

    def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
        super().run(experiment_args, experiment_count)

View file

@ -6,11 +6,15 @@ from src.pytorch.base import TorchBase
from src.pytorch.div import TorchDivBench from src.pytorch.div import TorchDivBench
from src.pytorch.mul import TorchMulBench from src.pytorch.mul import TorchMulBench
from src.pytorch.matmul import TorchMatmulBench from src.pytorch.matmul import TorchMatmulBench
from src.pytorch.nn_dense import TorchNNDenseBench
from src.pytorch.nn_matmul import TorchNNMatmulBench
torch_ops: dict[Op, Type[TorchBase]] = { torch_ops: dict[Op, Type[TorchBase]] = {
Op.ADD: TorchAddBench, Op.ADD: TorchAddBench,
Op.MUL: TorchMulBench, Op.MUL: TorchMulBench,
Op.DIV: TorchDivBench, Op.DIV: TorchDivBench,
Op.MATMUL: TorchMatmulBench Op.MATMUL: TorchMatmulBench,
Op.NN_MATMUL: TorchNNMatmulBench,
Op.NN_DENSE: TorchNNDenseBench
} }

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import tensorflow as tf import tensorflow as tf
@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFAddBench(TFBase): class TFAddBench(TFBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.ADD) super().__init__(output_path, Op.ADD, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
with device: with self.device:
tensor_1 = tf.ones(shape_1, dtype=dtype) self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype) self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 + self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 + tensor_2 self.tensor_result = self.tensor_1 + self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -7,13 +7,13 @@ from src.common import DataType, Device, Op, Platform
class TFBase(BenchBase): class TFBase(BenchBase):
def __init__(self, output_path: Path, bench_op: Op): def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
gpus = tf.config.list_physical_devices('GPU') gpus = tf.config.list_physical_devices('GPU')
if gpus: if gpus:
if len(gpus) > 1: if len(gpus) > 1:
print('WARINING : no multiple CUDA device benchmark implemented yet (only using first)') print('WARINING : no multiple CUDA device benchmark implemented yet (only using first)')
tf.config.experimental.set_memory_growth(gpus[0], True) # tf.config.experimental.set_memory_growth(gpus[0], True)
tf.config.set_visible_devices(gpus[0], 'GPU') tf.config.set_visible_devices(gpus[0], 'GPU')
# logical_gpus = tf.config.list_logical_devices('GPU') # logical_gpus = tf.config.list_logical_devices('GPU')
device_type = Device.GPU device_type = Device.GPU
@ -22,22 +22,16 @@ class TFBase(BenchBase):
device_type = Device.CPU device_type = Device.CPU
device = tf.device('/CPU:0') device = tf.device('/CPU:0')
super().__init__(output_path, Platform.TF2, bench_op, device_type, device)
def get_dtype(self, data_type: DataType) -> tf.DType:
if data_type == DataType.FLOAT16: if data_type == DataType.FLOAT16:
return tf.float16 dtype = tf.float16
if data_type == DataType.FLOAT32: elif data_type == DataType.FLOAT32:
return tf.float32 dtype = tf.float32
if data_type == DataType.FLOAT64: elif data_type == DataType.FLOAT64:
return tf.float64 dtype = tf.float64
else:
raise RuntimeError(f'data_type {data_type.value} not implemented') raise RuntimeError(f'data_type {data_type.value} not implemented')
def experiment(self, _experiment_args, _length, _dtype, _device): super().__init__(output_path, Platform.TF2, bench_op, device_type, device, data_type, dtype)
raise NotImplementedError()
def name(self, _experiment_args) -> str: def experiment(self):
raise NotImplementedError()
def mop(self, _experiment_args) -> float:
raise NotImplementedError() raise NotImplementedError()

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import tensorflow as tf import tensorflow as tf
@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFDivBench(TFBase): class TFDivBench(TFBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.DIV) super().__init__(output_path, Op.DIV, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
with device: with self.device:
tensor_1 = tf.ones(shape_1, dtype=dtype) self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype) self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 / self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 / tensor_2 self.tensor_result = self.tensor_1 / self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import tensorflow as tf import tensorflow as tf
@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFMatmulBench(TFBase): class TFMatmulBench(TFBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MATMUL) super().__init__(output_path, Op.MATMUL, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1, shape_2 = experiment_args shape_1, shape_2 = experiment_args
with device: with self.device:
tensor_1 = tf.ones(shape_1, dtype=dtype) self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
tensor_2 = tf.ones(shape_2, dtype=dtype) self.tensor_2 = tf.ones(shape_2, dtype=self.dtype)
self.tensor_result = self.tensor_1 @ self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 @ tensor_2 self.tensor_result = self.tensor_1 @ self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
shape_1, shape_2 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1, shape_2 = experiment_args
return (shape_1[0] * shape_2[1] / 1000_000) * 2 * (shape_1[1] - 1)
def run(self,
experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

View file

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List, Tuple
import tensorflow as tf import tensorflow as tf
@ -7,28 +8,21 @@ from src.tf_2.base import TFBase
class TFMulBench(TFBase): class TFMulBench(TFBase):
def __init__(self, output_path: Path): def __init__(self, output_path: Path, data_type: DataType):
super().__init__(output_path, Op.MUL) super().__init__(output_path, Op.MUL, data_type)
self.tensor_1: tf.Tensor = None
self.tensor_2: tf.Tensor = None
self.tensor_result: tf.Tensor = None
def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device): def pre_experiment(self, experiment_args: Tuple[int, int]):
shape_1 = experiment_args shape_1 = experiment_args
with device: with self.device:
tensor_1 = tf.ones(shape_1, dtype=dtype) self.tensor_1 = tf.ones(shape_1, dtype=self.dtype)
tensor_2 = tf.ones(shape_1, dtype=dtype) self.tensor_2 = tf.ones(shape_1, dtype=self.dtype)
self.tensor_result = self.tensor_1 * self.tensor_2
for _ in range(length): def experiment(self):
_ = tensor_1 * tensor_2 self.tensor_result = self.tensor_1 * self.tensor_2
def name(self, experiment_args: tuple[int, int]) -> str: def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
shape_1 = experiment_args super().run(experiment_args, experiment_count)
return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'
def mop(self, experiment_args: tuple[int, int]) -> float:
shape_1 = experiment_args
return shape_1[0] * shape_1[1] / 1000_000
def run(self,
experiment_args: list[tuple[int, int]],
experiment_count: int,
data_type: DataType):
super().run(experiment_args, experiment_count, data_type)

35
src/tf_2/nn_dense.py Normal file
View file

@ -0,0 +1,35 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
from src.common import DataType, Op
from src.tf_2.base import TFBase
class DenseModel(tf.keras.Model):
    """Keras model with one dense layer mapping ``input_dim`` features to ``input_dim`` units."""

    def __init__(self, input_dim: int, dtype: tf.DType = tf.float32):
        # Fix: the parameter was declared as ``dtype=tf.DType`` — the *class*
        # tf.DType used as a default value where a type annotation was intended
        # (cf. the PyTorch twin, ``dtype: torch.dtype``). A caller omitting the
        # argument would have passed the class itself into Dense(dtype=...).
        super().__init__()
        self.dense = tf.keras.layers.Dense(input_dim, dtype=dtype)

    def call(self, input_tensor: tf.Tensor) -> tf.Tensor:
        """Apply the dense layer to ``input_tensor``."""
        return self.dense(input_tensor)
class TFNNDenseBench(TFBase):
    """Benchmark of a single dense layer forward pass in TensorFlow 2 (Keras)."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_DENSE, data_type)
        self.tensor: tf.Tensor = None
        # Fix: keep the forward-pass output, for consistency with the other
        # NN benchmarks (e.g. TFNNMatmulBench) and so the result is referenced.
        self.tensor_result: tf.Tensor = None
        self.network: tf.keras.Model = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Create the input batch and the model on the benchmark device."""
        batch_size, dimension = experiment_args
        with self.device:
            self.tensor = tf.ones((batch_size, dimension), dtype=self.dtype)
            self.network = DenseModel(dimension, self.dtype)
            # Warm-up call: builds the layer weights up front so the timed runs
            # do not include one-time layer construction (mirrors the PyTorch
            # benchmark, which also runs one untimed forward pass here).
            self.tensor_result = self.network(self.tensor)

    def experiment(self):
        """One timed forward pass; the output is stored so it cannot be discarded."""
        self.tensor_result = self.network(self.tensor)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        super().run(experiment_args, experiment_count)

34
src/tf_2/nn_matmul.py Normal file
View file

@ -0,0 +1,34 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow as tf
from src.common import DataType, Op
from src.tf_2.base import TFBase
class MatmulModel(tf.keras.Model):
    """Parameter-less Keras model computing the matrix product of its two inputs."""

    def call(self, tensor_1: tf.Tensor, tensor_2: tf.Tensor) -> tf.Tensor:
        """Return ``tensor_1`` matrix-multiplied by ``tensor_2``."""
        product = tf.matmul(tensor_1, tensor_2)
        return product
class TFNNMatmulBench(TFBase):
    """Benchmark of a matrix product executed through a Keras model call in TF2."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.NN_MATMUL, data_type)
        # Populated by pre_experiment so that experiment() needs no arguments.
        self.tensor_1: tf.Tensor = None
        self.tensor_2: tf.Tensor = None
        self.tensor_result: tf.Tensor = None
        self.network: tf.keras.Model = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Allocate both operands on the benchmark device and build the model."""
        shape_left, shape_right = experiment_args
        with self.device:
            self.tensor_1 = tf.ones(shape_left, dtype=self.dtype)
            self.tensor_2 = tf.ones(shape_right, dtype=self.dtype)
            self.network = MatmulModel()

    def experiment(self):
        """One timed model call computing the matrix product."""
        self.tensor_result = self.network(self.tensor_1, self.tensor_2)

    def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
        super().run(experiment_args, experiment_count)

View file

@ -6,11 +6,15 @@ from src.tf_2.base import TFBase
from src.tf_2.div import TFDivBench from src.tf_2.div import TFDivBench
from src.tf_2.mul import TFMulBench from src.tf_2.mul import TFMulBench
from src.tf_2.matmul import TFMatmulBench from src.tf_2.matmul import TFMatmulBench
from src.tf_2.nn_dense import TFNNDenseBench
from src.tf_2.nn_matmul import TFNNMatmulBench
tf2_ops: dict[Op, Type[TFBase]] = { tf2_ops: dict[Op, Type[TFBase]] = {
Op.ADD: TFAddBench, Op.ADD: TFAddBench,
Op.MUL: TFMulBench, Op.MUL: TFMulBench,
Op.DIV: TFDivBench, Op.DIV: TFDivBench,
Op.MATMUL: TFMatmulBench Op.MATMUL: TFMatmulBench,
Op.NN_MATMUL: TFNNMatmulBench,
Op.NN_DENSE: TFNNDenseBench
} }

30
src/tf_2_v1/add.py Normal file
View file

@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFAddBench(TFBase):
    """TF1-style (graph/session) benchmark of element-wise tensor addition."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.ADD, data_type)
        # Graph node evaluated by each timed session.run call.
        self.add_op = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the addition graph for this shape and initialise its variables."""
        super().pre_experiment(experiment_args)
        shape = experiment_args
        lhs = tf.get_variable('tensor_1', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        rhs = tf.get_variable('tensor_2', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        self.add_op = tf.add(lhs, rhs)
        self.session.run(tf.initializers.global_variables())

    def experiment(self):
        """One timed evaluation of the addition node."""
        self.session.run(self.add_op)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        super().run(experiment_args, experiment_count)

43
src/tf_2_v1/base.py Normal file
View file

@ -0,0 +1,43 @@
from pathlib import Path
import tensorflow.compat.v1 as tf
from src.base import BenchBase
from src.common import DataType, Device, Op, Platform
class TFBase(BenchBase):
    """Shared setup for TensorFlow 1.x-style (graph/session) benchmarks."""

    def __init__(self, output_path: Path, bench_op: Op, data_type: DataType):
        # Map the framework-agnostic data type onto the TF dtype used by all ops.
        if data_type == DataType.FLOAT16:
            dtype = tf.float16
        elif data_type == DataType.FLOAT32:
            dtype = tf.float32
        elif data_type == DataType.FLOAT64:
            dtype = tf.float64
        else:
            raise RuntimeError(f'data_type {data_type.value} not implemented')
        # NOTE(review): device is hard-coded to GPU with no tf.device handle —
        # TF1 sessions place ops themselves; confirm this is intended on
        # CPU-only hosts.
        super().__init__(output_path, Platform.TF2_V1, bench_op, Device.GPU, None, data_type, dtype)
        self.session: tf.Session = None

    def pre_experiment(self, _experiment_args):
        """Open a fresh session; subclasses build their graph after calling this."""
        tf.disable_v2_behavior()
        # tf.disable_eager_execution()
        # gpu_options = tf.GPUOptions(allow_growth=True)
        # session_config = tf.ConfigProto(gpu_options=gpu_options)
        # self.session = tf.Session(config=session_config)
        self.session = tf.Session()
        # Fix: removed ``self.session.as_default()`` — it returns a context
        # manager and installs nothing unless entered via ``with``, so the bare
        # call was a no-op. All evaluations go through self.session.run anyway.

    def post_experiment(self):
        """Release the session and clear the graph so experiments stay independent."""
        self.session.close()
        tf.reset_default_graph()

    def experiment(self):
        raise NotImplementedError()

    def name(self, _experiment_args) -> str:
        raise NotImplementedError()

    def mop(self, _experiment_args) -> float:
        raise NotImplementedError()

30
src/tf_2_v1/div.py Normal file
View file

@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFDivBench(TFBase):
    """TF1-style (graph/session) benchmark of element-wise tensor division."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.DIV, data_type)
        # Graph node evaluated by each timed session.run call.
        self.div_op = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the division graph for this shape and initialise its variables."""
        super().pre_experiment(experiment_args)
        shape = experiment_args
        lhs = tf.get_variable('tensor_1', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        rhs = tf.get_variable('tensor_2', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        self.div_op = tf.math.truediv(lhs, rhs)
        self.session.run(tf.initializers.global_variables())

    def experiment(self):
        """One timed evaluation of the division node."""
        self.session.run(self.div_op)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        super().run(experiment_args, experiment_count)

30
src/tf_2_v1/matmul.py Normal file
View file

@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFMatmulBench(TFBase):
    """TF1-style (graph/session) benchmark of a matrix product."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.MATMUL, data_type)
        # Graph node evaluated by each timed session.run call.
        self.matmul_op = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the matmul graph for these shapes and initialise its variables."""
        super().pre_experiment(experiment_args)
        shape_left, shape_right = experiment_args
        lhs = tf.get_variable('tensor_1', shape=shape_left, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        rhs = tf.get_variable('tensor_2', shape=shape_right, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        self.matmul_op = tf.matmul(lhs, rhs)
        self.session.run(tf.initializers.global_variables())

    def experiment(self):
        """One timed evaluation of the matmul node."""
        self.session.run(self.matmul_op)

    def run(self, experiment_args: List[Tuple[Tuple[int, int], Tuple[int, int]]], experiment_count: int):
        super().run(experiment_args, experiment_count)

30
src/tf_2_v1/mul.py Normal file
View file

@ -0,0 +1,30 @@
from pathlib import Path
from typing import List, Tuple
import tensorflow.compat.v1 as tf
from src.common import DataType, Op
from src.tf_2_v1.base import TFBase
class TFMulBench(TFBase):
    """TF1-style (graph/session) benchmark of element-wise tensor multiplication."""

    def __init__(self, output_path: Path, data_type: DataType):
        super().__init__(output_path, Op.MUL, data_type)
        # Graph node evaluated by each timed session.run call.
        self.mul_op = None

    def pre_experiment(self, experiment_args: Tuple[int, int]):
        """Build the multiplication graph for this shape and initialise its variables."""
        super().pre_experiment(experiment_args)
        shape = experiment_args
        lhs = tf.get_variable('tensor_1', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        rhs = tf.get_variable('tensor_2', shape=shape, dtype=self.dtype,
                              initializer=tf.initializers.ones, trainable=False)
        self.mul_op = tf.multiply(lhs, rhs)
        self.session.run(tf.initializers.global_variables())

    def experiment(self):
        """One timed evaluation of the multiplication node."""
        self.session.run(self.mul_op)

    def run(self, experiment_args: List[Tuple[int, int]], experiment_count: int):
        super().run(experiment_args, experiment_count)

16
src/tf_2_v1/ops.py Normal file
View file

@ -0,0 +1,16 @@
from typing import Dict, Type

from src.common import Op
from src.tf_2_v1.add import TFAddBench
from src.tf_2_v1.base import TFBase
from src.tf_2_v1.div import TFDivBench
from src.tf_2_v1.mul import TFMulBench
from src.tf_2_v1.matmul import TFMatmulBench

# Registry mapping each benchmark operation to its TF1-compat implementation.
# ``Dict[...]`` (typing) instead of ``dict[...]``: variable annotations are
# evaluated at runtime, and subscripting builtins requires Python >= 3.9,
# while this codebase targets older interpreters (see the List/Tuple usage
# throughout the other modules).
tf2v1_ops: Dict[Op, Type[TFBase]] = {
    Op.ADD: TFAddBench,
    Op.MUL: TFMulBench,
    Op.DIV: TFDivBench,
    Op.MATMUL: TFMatmulBench
}