Implement TF2 and add, mul and div benchmark

Corentin 2021-09-28 02:59:53 +09:00
commit 4b2bcfe7e8
18 changed files with 649 additions and 171 deletions


@@ -1,39 +1,99 @@
 from argparse import ArgumentParser
+import multiprocessing as mp
+import os
 from pathlib import Path
+from typing import Type

-from src.base import DataType
-from src.torch.matmul import TorchMatmulBench
+from src.base import BenchBase
+from src.common import DataType, Op, Platform
+
+
+def run_benchmark(output_path: Path, platform: Platform, data_type: DataType, bench_op: Op,
+                  bench_args, bench_count: int):
+    if platform == Platform.TF2:
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+        from src.tf_2.ops import tf2_ops
+        if bench_op not in tf2_ops:
+            print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
+        else:
+            tf2_ops[bench_op](output_path).run(bench_args, bench_count, data_type)
+            print()
+    elif platform == Platform.TORCH:
+        from src.pytorch.ops import torch_ops
+        if bench_op not in torch_ops:
+            print(f'Operation {bench_op.value} is not implemented for {platform.value} yet')
+        else:
+            torch_ops[bench_op](output_path).run(bench_args, bench_count, data_type)
+            print()
+    else:
+        print(f'Platform {platform.value} is not implemented yet')


 def main():
     parser = ArgumentParser()
     parser.add_argument('--output', type=Path, default=Path('output'), help='Path to output files')
+    parser.add_argument('--count', type=int, default=30,
+                        help='Number of experiments per benchmark (for statistical analysis)')
+    parser.add_argument('--platform', nargs='*', type=Platform,
+                        help='List of platforms to benchmark [TF1, TF2, Torch] (else all are used)')
+    parser.add_argument('--data', nargs='*', type=DataType,
+                        help='List of data types to benchmark [float16, float32, float64] (else all are used)')
+    parser.add_argument('--op', nargs='*', type=Op,
+                        help='List of operations to benchmark [add, mul, div, matmul] (else all are used)')
     arguments = parser.parse_args()
     output_path: Path = arguments.output
+    bench_count: int = arguments.count
+    platforms: list[Platform] = arguments.platform if arguments.platform is not None else list(Platform)
+    data: list[DataType] = arguments.data if arguments.data is not None else list(DataType)
+    bench_ops: list[Op] = arguments.op if arguments.op is not None else list(Op)
     if not output_path.exists():
         output_path.mkdir(parents=True)
-    for data_type in DataType:
-        TorchMatmulBench(output_path).run(
-            [
-                ((100, 100), (100, 100)),
-                ((100, 200), (200, 100)),
-                ((128, 128), (128, 128)),
-                ((200, 100), (100, 200)),
-                ((200, 200), (200, 200)),
-                ((256, 256), (256, 256)),
-                ((256, 512), (512, 256)),
-                ((400, 400), (400, 400)),
-                ((512, 256), (256, 512)),
-                ((512, 512), (512, 512)),
-                ((800, 800), (800, 800)),
-                ((1000, 1000), (1000, 1000)),
-                ((1200, 1200), (1200, 1200)),
-            ],
-            12,
-            data_type)
+    benchmarks: list[tuple[Path, Platform, DataType, Op, list, int]] = []
+    element_wise_args = [
+        (100, 100),
+        (100, 200),
+        (128, 128),
+        (200, 100),
+        (200, 200),
+        (256, 256),
+        (256, 512),
+        (512, 256),
+        (400, 400),
+        (512, 512),
+        (800, 800),
+        (1024, 1024),
+        (1800, 1800)]
+    matmul_args = [
+        ((100, 100), (100, 100)),
+        ((100, 200), (200, 100)),
+        ((128, 128), (128, 128)),
+        ((200, 100), (100, 200)),
+        ((200, 200), (200, 200)),
+        ((256, 256), (256, 256)),
+        ((256, 512), (512, 256)),
+        ((400, 400), (400, 400)),
+        ((512, 256), (256, 512)),
+        ((512, 512), (512, 512)),
+        ((800, 800), (800, 800)),
+        ((1000, 1000), (1000, 1000)),
+        ((1200, 1200), (1200, 1200))]
+    for platform in platforms:
+        for data_type in data:
+            for bench_op in [Op.ADD, Op.MUL, Op.DIV]:
+                if bench_op in bench_ops:
+                    benchmarks.append((output_path, platform, data_type, bench_op, element_wise_args, bench_count))
+            if Op.MATMUL in bench_ops:
+                benchmarks.append((output_path, platform, data_type, Op.MATMUL, matmul_args, bench_count))
+    # Each benchmark runs in its own process, so framework and GPU state are
+    # fully released between runs.
+    for benchmark in benchmarks:
+        process = mp.Process(target=run_benchmark, args=benchmark)
+        process.start()
+        process.join()
     print('Benchmark done')
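
Note that the --platform, --data and --op flags work by argparse passing each raw string to the Enum constructor, so values must match the enum values exactly ('Torch', not 'torch'). A standalone sketch of that mapping, using the src.common module from this commit:

from src.common import DataType, Op, Platform

# argparse calls type=Platform on each string, i.e. Platform('Torch'):
print(Platform('Torch'))    # Platform.TORCH
print(DataType('float32'))  # DataType.FLOAT32
print(Op('matmul'))         # Op.MATMUL
# A string that is not an enum value ('torch') raises ValueError,
# which argparse reports as an invalid argument.

So a hypothetical invocation like `python <entry script> --platform Torch --data float32 --op matmul` runs only the Torch matmul bench, while an invocation with no flags benchmarks every platform, data type and operation.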

src/base.py (modified)

@@ -1,22 +1,106 @@
 from pathlib import Path
-from enum import Enum
+import time
+
+import numpy as np
+import pandas as pd
+
+from src.common import DataKey, DataType, Device, Op, Platform
+from src.plot import plot_experiments
+from src.utils import get_cpu_name, get_nvidia_name


-class Device(Enum):
-    CPU = 'cpu'
-    GPU = 'gpu'
-
-
-class DataType(Enum):
-    FLOAT16 = 'float16'
-    FLOAT32 = 'float32'
-    FLOAT64 = 'float64'
-
-
-class Base():
-    def __init__(self, output_path: Path):
+class BenchBase():
+    def __init__(self, output_path: Path, platform: Platform, bench_op: Op, device_type: Device, device):
         self._base_output_path = output_path
         self.output_path = output_path
+        self.platform = platform
+        self.bench_op = bench_op
+        self.device_type = device_type
+        self.device = device
+        self.dtype = None

     def set_output_path(self, device: Device, device_name: str):
-        self.output_path = self._base_output_path / f'{device.value}_{device_name}'
+        self.output_path = (
+            self._base_output_path / f'{device.value}_{device_name}' / self.platform.value / self.bench_op.value)
+
+    def get_dtype(self, data_type: DataType):
+        raise NotImplementedError()
+
+    def experiment(self, _experiment_args, _length, _dtype, _device):
+        raise NotImplementedError()
+
+    def name(self, _experiment_args) -> str:
+        raise NotImplementedError()
+
+    def mop(self, _experiment_args) -> float:
+        raise NotImplementedError()
+
+    def run(self, experiment_args, experiment_count: int, data_type: DataType):
+        self.set_output_path(self.device_type, get_cpu_name() if self.device_type == Device.CPU else get_nvidia_name())
+        if not self.output_path.exists():
+            self.output_path.mkdir(parents=True)
+        dtype = self.get_dtype(data_type)
+        print(f'Starting {self.platform.value}\'s {self.bench_op.value} benchmark with data type: {data_type.value}')
+        experiment_names = []
+        experiment_lengths = []
+        experiment_times = []
+        experiment_mop = []
+        for args in experiment_args:
+            # warmup
+            for _ in range(4):
+                self.experiment(args, 5, dtype, self.device)
+            # speed evaluation
+            counter = 0
+            start_time = time.time()
+            while time.time() - start_time < 0.2:
+                self.experiment(args, 10, dtype, self.device)
+                counter += 10
+            end_time = time.time()
+            target_time = 1.0  # in s
+            experiment_speed = counter / (end_time - start_time)  # in op/s
+            experiment_length = max(int(target_time / experiment_count * experiment_speed), 2)
+            # print(f'Evaluated {counter} {self.bench_op.value} in {end_time - start_time:0.3f}s'
+            #       f' => {experiment_speed:.03f}{self.bench_op.value}/s'
+            #       f', estimate {target_time:.03f}s with {experiment_length}x{experiment_count} exps')
+            run_times = []
+            for _ in range(experiment_count):
+                start_time = time.time()
+                self.experiment(args, experiment_length, dtype, self.device)
+                run_times.append(time.time() - start_time)
+            experiment_times += run_times
+            experiment_names += [self.name(args)] * experiment_count
+            experiment_lengths += [experiment_length] * experiment_count
+            experiment_mop += [self.mop(args)] * experiment_count
+            total_time = np.array(run_times, dtype=np.float64).sum()
+            total_glop = self.mop(args) * experiment_length * experiment_count / 1000
+            print(f'Run {experiment_names[-1]} (x{experiment_length})'
+                  f' in {total_time:0.2f}s => {total_glop / total_time:0.3f}GFLOPS')
+        data = self.save_experiments(experiment_names, experiment_times, experiment_lengths, experiment_mop, data_type)
+        plot_experiments(self.output_path, data, data_type, self.bench_op, self.platform)
+
+    def save_experiments(
+            self, experiment_names: list[str], experiment_times: list[float],
+            experiment_lengths: list[int], experiment_mop: list[float], data_type: DataType) -> pd.DataFrame:
+        key = DataKey(self.bench_op)
+        data = pd.DataFrame(
+            {
+                key.experiment: experiment_names,
+                key.time: experiment_times,
+                key.count: experiment_lengths,
+                key.speed: [(1000.0 * t) / l for t, l in zip(experiment_times, experiment_lengths)],
+                key.mop: experiment_mop,
+                key.gflops: [(mop * l) / (t * 1000.0)
+                             for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)]
+            })
+        data.to_csv(self.output_path / f'{self.bench_op.value}_{data_type.value}.csv', sep='\t')
+        return data
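
The calibration in run sizes each timed run so that the experiment_count runs together take roughly target_time = 1.0 s: a 0.2 s probe (in batches of 10 ops) estimates the op rate, and experiment_length is derived from it. A minimal standalone sketch of the same arithmetic, with made-up probe numbers:

# Hypothetical probe result: 1200 ops completed during the 0.2 s window.
counter, elapsed = 1200, 0.2
experiment_count = 30
target_time = 1.0                     # total measurement budget, in s
experiment_speed = counter / elapsed  # 6000 op/s
experiment_length = max(int(target_time / experiment_count * experiment_speed), 2)
print(experiment_length)              # 200 ops per timed run
# 30 runs x 200 ops at 6000 op/s is about 1.0 s of measurement, as intended;
# the max(..., 2) floor keeps very slow ops from rounding down to 0 or 1.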

src/common.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from enum import Enum


class Device(Enum):
    CPU = 'cpu'
    GPU = 'gpu'


class DataType(Enum):
    FLOAT16 = 'float16'
    FLOAT32 = 'float32'
    FLOAT64 = 'float64'


class Op(Enum):
    NO_OP = 'noop'
    ADD = 'add'
    DIV = 'div'
    MUL = 'mul'
    MATMUL = 'matmul'


class Platform(Enum):
    TF1 = 'TF1'
    TF2 = 'TF2'
    TORCH = 'Torch'


class DataKey():
    def __init__(self, bench_op: Op):
        self.experiment = 'experiment'
        self.time = 'run times (s)'
        self.count = 'count'
        self.mop = f'Mop/{bench_op.value}'
        self.speed = f'ms/{bench_op.value}'
        self.gflops = 'GFLOPS'
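
DataKey centralizes the column labels shared by BenchBase.save_experiments and plot_experiments, with the op name baked into the per-op columns. For instance:

from src.common import DataKey, Op

key = DataKey(Op.MATMUL)
print(key.mop)     # 'Mop/matmul'
print(key.speed)   # 'ms/matmul'
print(key.gflops)  # 'GFLOPS'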

src/plot.py (new file, 51 lines)

@@ -0,0 +1,51 @@
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.common import DataKey, DataType, Op, Platform


def plot_experiments(output_path: Path, data: pd.DataFrame, data_type: DataType, bench_op: Op, platform: Platform):
    key = DataKey(bench_op)
    sum_data = data[[key.experiment, key.time, key.count]].groupby(
        key.experiment, as_index=False, sort=False).sum()
    mean_data = data[[key.experiment, key.speed]].groupby(
        key.experiment, as_index=False, sort=False).mean()
    max_data = data[[key.experiment, key.mop]].groupby(
        key.experiment, as_index=False, sort=False).max()
    sns.set_theme(style="ticks")
    figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12))
    figure.suptitle(f'{platform.value} {bench_op.value} ({data_type.value})', fontsize=16)
    for axe in axes[:-1]:
        axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    chart = sns.barplot(x=key.experiment, y=key.mop, data=max_data, ax=axes[0], order=data[key.experiment].unique())
    axes[0].set_yscale("log")
    for patch, value in zip(chart.patches, max_data[key.mop]):
        chart.annotate(f'{value:0.3f}',
                       (patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
                       ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                       textcoords='offset points')
    chart = sns.barplot(x=key.experiment, y=key.speed, data=data, estimator=np.median, ax=axes[1])
    for patch, value in zip(chart.patches, mean_data[key.speed]):
        chart.annotate(f'{value:.3f}',
                       (patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
                       ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                       textcoords='offset points')
    chart = sns.barplot(x=key.experiment, y=key.gflops, data=data, estimator=np.median, ax=axes[2])
    for patch, mop, count, value in zip(chart.patches, max_data[key.mop], sum_data[key.count], sum_data[key.time]):
        chart.annotate(f'{(mop * count / 1000) / value:.3f}',
                       (patch.get_x() + patch.get_width() / 2.0, patch.get_height()),
                       ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                       textcoords='offset points')
    plt.xticks(rotation=20)
    plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.93, right=0.99, bottom=0.1, left=0.05)
    plt.savefig(output_path / f'{bench_op.value}_{data_type.value}.png')
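
The two lower panels pass the full per-run DataFrame to sns.barplot with estimator=np.median, so each bar is the median over the repeated runs rather than the default mean, which damps outlier runs. A toy illustration with hypothetical numbers:

import numpy as np
import pandas as pd
import seaborn as sns

# The single outlier run (5.0) barely moves the 'a' bar under the median.
toy = pd.DataFrame({'experiment': ['a'] * 3 + ['b'] * 3,
                    'GFLOPS': [1.0, 1.1, 5.0, 2.0, 2.1, 2.2]})
ax = sns.barplot(x='experiment', y='GFLOPS', data=toy, estimator=np.median)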

src/pytorch/add.py (new file, 33 lines)

@@ -0,0 +1,33 @@
from pathlib import Path

import torch

from src.common import DataType, Op
from src.pytorch.base import TorchBase


class TorchAddBench(TorchBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.ADD)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
        shape_1 = experiment_args
        tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        for _ in range(length):
            _ = tensor_1 + tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/pytorch/base.py (new file, 39 lines)

@@ -0,0 +1,39 @@
from pathlib import Path

import torch

from src.base import BenchBase
from src.common import DataType, Device, Op, Platform


class TorchBase(BenchBase):
    def __init__(self, output_path: Path, bench_op: Op):
        if torch.cuda.is_available():
            if torch.cuda.device_count() > 1:
                print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
            torch.backends.cudnn.benchmark = True
            device_type = Device.GPU
            device = torch.device('cuda:0')
        else:
            device_type = Device.CPU
            device = torch.device('cpu')
        super().__init__(output_path, Platform.TORCH, bench_op, device_type, device)

    def get_dtype(self, data_type: DataType) -> torch.dtype:
        if data_type == DataType.FLOAT16:
            return torch.float16
        if data_type == DataType.FLOAT32:
            return torch.float32
        if data_type == DataType.FLOAT64:
            return torch.float64
        raise NotImplementedError(f'data_type {data_type.value} not implemented')

    def experiment(self, _experiment_args, _length, _dtype, _device):
        raise NotImplementedError()

    def name(self, _experiment_args) -> str:
        raise NotImplementedError()

    def mop(self, _experiment_args) -> float:
        raise NotImplementedError()

src/pytorch/div.py (new file, 33 lines)

@@ -0,0 +1,33 @@
from pathlib import Path

import torch

from src.common import DataType, Op
from src.pytorch.base import TorchBase


class TorchDivBench(TorchBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.DIV)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
        shape_1 = experiment_args
        tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        for _ in range(length):
            _ = tensor_1 / tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/pytorch/matmul.py (new file, 33 lines)

@@ -0,0 +1,33 @@
from pathlib import Path

import torch

from src.common import DataType, Op
from src.pytorch.base import TorchBase


class TorchMatmulBench(TorchBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.MATMUL)

    def experiment(self, experiment_args: tuple[tuple[int, int], tuple[int, int]],
                   length: int, dtype: torch.dtype, device: torch.device):
        shape_1, shape_2 = experiment_args
        tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        tensor_2 = torch.ones(shape_2, dtype=dtype, device=device, requires_grad=False)
        for _ in range(length):
            _ = tensor_1 @ tensor_2

    def name(self, experiment_args: tuple[tuple[int, int], tuple[int, int]]) -> str:
        shape_1, shape_2 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'

    def mop(self, experiment_args: tuple[tuple[int, int], tuple[int, int]]) -> float:
        shape_1, shape_2 = experiment_args
        return (shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)

    def run(self,
            experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)
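
The mop formula counts one multiply plus one add per accumulation step, roughly 2·M·N·(K-1) operations for an MxK @ KxN product, expressed in Mop. Worked through for the 512x512 @ 512x512 case:

shape_1, shape_2 = (512, 512), (512, 512)
mop = (shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)
print(f'{mop:.1f} Mop')  # 267.9 Mop per matmul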

src/pytorch/mul.py (new file, 33 lines)

@@ -0,0 +1,33 @@
from pathlib import Path

import torch

from src.common import DataType, Op
from src.pytorch.base import TorchBase


class TorchMulBench(TorchBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.MUL)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: torch.dtype, device: torch.device):
        shape_1 = experiment_args
        tensor_1 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        tensor_2 = torch.ones(shape_1, dtype=dtype, device=device, requires_grad=False)
        for _ in range(length):
            _ = tensor_1 * tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/pytorch/ops.py (new file, 16 lines)

@@ -0,0 +1,16 @@
from typing import Type

from src.common import Op
from src.pytorch.add import TorchAddBench
from src.pytorch.base import TorchBase
from src.pytorch.div import TorchDivBench
from src.pytorch.mul import TorchMulBench
from src.pytorch.matmul import TorchMatmulBench

torch_ops: dict[Op, Type[TorchBase]] = {
    Op.ADD: TorchAddBench,
    Op.MUL: TorchMulBench,
    Op.DIV: TorchDivBench,
    Op.MATMUL: TorchMatmulBench
}
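
run_benchmark in the entry script resolves Op values through this table; the same dispatch can be done by hand, e.g. with a hypothetical single-shape run:

from pathlib import Path

from src.common import DataType, Op
from src.pytorch.ops import torch_ops

# Instantiate the add bench and time one element-wise shape, 30 runs.
bench = torch_ops[Op.ADD](Path('output'))
bench.run([(256, 256)], 30, DataType.FLOAT32)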

src/tf_2/add.py (new file, 34 lines)

@@ -0,0 +1,34 @@
from pathlib import Path

import tensorflow as tf

from src.common import DataType, Op
from src.tf_2.base import TFBase


class TFAddBench(TFBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.ADD)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
        shape_1 = experiment_args
        with device:
            tensor_1 = tf.ones(shape_1, dtype=dtype)
            tensor_2 = tf.ones(shape_1, dtype=dtype)
            for _ in range(length):
                _ = tensor_1 + tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} + {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/tf_2/base.py (new file, 43 lines)

@@ -0,0 +1,43 @@
from pathlib import Path

import tensorflow as tf

from src.base import BenchBase
from src.common import DataType, Device, Op, Platform


class TFBase(BenchBase):
    def __init__(self, output_path: Path, bench_op: Op):
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            if len(gpus) > 1:
                print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            tf.config.set_visible_devices(gpus[0], 'GPU')
            # logical_gpus = tf.config.list_logical_devices('GPU')
            device_type = Device.GPU
            device = tf.device('/GPU:0')
        else:
            device_type = Device.CPU
            device = tf.device('/CPU:0')
        super().__init__(output_path, Platform.TF2, bench_op, device_type, device)

    def get_dtype(self, data_type: DataType) -> tf.DType:
        if data_type == DataType.FLOAT16:
            return tf.float16
        if data_type == DataType.FLOAT32:
            return tf.float32
        if data_type == DataType.FLOAT64:
            return tf.float64
        raise RuntimeError(f'data_type {data_type.value} not implemented')

    def experiment(self, _experiment_args, _length, _dtype, _device):
        raise NotImplementedError()

    def name(self, _experiment_args) -> str:
        raise NotImplementedError()

    def mop(self, _experiment_args) -> float:
        raise NotImplementedError()
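
Unlike the Torch benches, which pass device= to the tensor constructor, the TF2 benches rely on the with device: scope for placement. A quick placement check (assuming TF 2.x eager mode):

import tensorflow as tf

with tf.device('/CPU:0'):
    tensor = tf.ones((2, 2))
print(tensor.device)  # e.g. '/job:localhost/replica:0/task:0/device:CPU:0'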

src/tf_2/div.py (new file, 34 lines)

@@ -0,0 +1,34 @@
from pathlib import Path

import tensorflow as tf

from src.common import DataType, Op
from src.tf_2.base import TFBase


class TFDivBench(TFBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.DIV)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
        shape_1 = experiment_args
        with device:
            tensor_1 = tf.ones(shape_1, dtype=dtype)
            tensor_2 = tf.ones(shape_1, dtype=dtype)
            for _ in range(length):
                _ = tensor_1 / tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} / {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/tf_2/matmul.py (new file, 34 lines)

@@ -0,0 +1,34 @@
from pathlib import Path

import tensorflow as tf

from src.common import DataType, Op
from src.tf_2.base import TFBase


class TFMatmulBench(TFBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.MATMUL)

    def experiment(self, experiment_args: tuple[tuple[int, int], tuple[int, int]],
                   length: int, dtype: tf.DType, device: tf.device):
        shape_1, shape_2 = experiment_args
        with device:
            tensor_1 = tf.ones(shape_1, dtype=dtype)
            tensor_2 = tf.ones(shape_2, dtype=dtype)
            for _ in range(length):
                _ = tensor_1 @ tensor_2

    def name(self, experiment_args: tuple[tuple[int, int], tuple[int, int]]) -> str:
        shape_1, shape_2 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'

    def mop(self, experiment_args: tuple[tuple[int, int], tuple[int, int]]) -> float:
        shape_1, shape_2 = experiment_args
        return (shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)

    def run(self,
            experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/tf_2/mul.py (new file, 34 lines)

@@ -0,0 +1,34 @@
from pathlib import Path

import tensorflow as tf

from src.common import DataType, Op
from src.tf_2.base import TFBase


class TFMulBench(TFBase):
    def __init__(self, output_path: Path):
        super().__init__(output_path, Op.MUL)

    def experiment(self, experiment_args: tuple[int, int], length: int, dtype: tf.DType, device: tf.device):
        shape_1 = experiment_args
        with device:
            tensor_1 = tf.ones(shape_1, dtype=dtype)
            tensor_2 = tf.ones(shape_1, dtype=dtype)
            for _ in range(length):
                _ = tensor_1 * tensor_2

    def name(self, experiment_args: tuple[int, int]) -> str:
        shape_1 = experiment_args
        return f'{shape_1[0]}x{shape_1[1]} * {shape_1[0]}x{shape_1[1]}'

    def mop(self, experiment_args: tuple[int, int]) -> float:
        shape_1 = experiment_args
        return shape_1[0] * shape_1[1] / 1_000_000

    def run(self,
            experiment_args: list[tuple[int, int]],
            experiment_count: int,
            data_type: DataType):
        super().run(experiment_args, experiment_count, data_type)

src/tf_2/ops.py (new file, 16 lines)

@@ -0,0 +1,16 @@
from typing import Type

from src.common import Op
from src.tf_2.add import TFAddBench
from src.tf_2.base import TFBase
from src.tf_2.div import TFDivBench
from src.tf_2.mul import TFMulBench
from src.tf_2.matmul import TFMatmulBench

tf2_ops: dict[Op, Type[TFBase]] = {
    Op.ADD: TFAddBench,
    Op.MUL: TFMulBench,
    Op.DIV: TFDivBench,
    Op.MATMUL: TFMatmulBench
}

src/torch/base.py (deleted)

@@ -1,23 +0,0 @@
from pathlib import Path

import torch

from src.base import Base, Device
from src.utils import get_cpu_name, get_nvidia_name


class TorchBase(Base):
    def __init__(self, output_path: Path):
        super().__init__(output_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if torch.cuda.is_available():
            if torch.cuda.device_count() > 1:
                print('WARNING: no multiple CUDA device benchmark implemented yet (only using first)')
            self.set_output_path(Device.GPU, get_nvidia_name())
            torch.backends.cudnn.benchmark = True
        else:
            self.set_output_path(Device.CPU, get_cpu_name())
        if not self.output_path.exists():
            self.output_path.mkdir(parents=True)

src/torch/matmul.py (deleted)

@@ -1,112 +0,0 @@
import time

from src.base import DataType
from src.torch.base import TorchBase

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch


class TorchMatmulBench(TorchBase):
    def run(self,
            experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
            experiment_count: int,
            data_type: DataType):
        sns.set_theme(style="ticks")
        dtype = None
        if data_type == DataType.FLOAT16:
            dtype = torch.float16
        elif data_type == DataType.FLOAT32:
            dtype = torch.float32
        elif data_type == DataType.FLOAT64:
            dtype = torch.float64
        else:
            raise RuntimeError(f'data_type {data_type.value} not implemented')
        print(f'Starting Torch Matmul Benchmark with data type: {data_type.value}')
        experiment_names = []
        experiment_lengths = []
        experiment_times = []
        experiment_mop = []
        for shape_1, shape_2 in experiment_args:
            tensor_1 = torch.ones(shape_1, dtype=dtype, device=self.device)
            tensor_2 = torch.ones(shape_2, dtype=dtype, device=self.device) / (shape_2[1] - 1.0)
            # warmup
            for _ in range(20):
                _ = tensor_1 @ tensor_2
            # speed evaluation
            counter = 0
            start_time = time.time()
            while time.time() - start_time < 0.2:
                _ = tensor_1 @ tensor_2
                counter += 1
            end_time = time.time()
            target_time = 0.5 / experiment_count  # in s
            experiment_speed = counter / (end_time - start_time)  # in op/s
            experiment_length = max(int(target_time * experiment_speed), 2)
            run_times = []
            for _ in range(experiment_count):
                start_time = time.time()
                for _ in range(experiment_length):
                    _ = tensor_1 @ tensor_2
                run_times.append(time.time() - start_time)
            experiment_times += run_times
            experiment_names += [f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'] * experiment_count
            experiment_lengths += [experiment_length] * experiment_count
            experiment_mop += [(shape_1[0] * shape_2[1] / 1_000_000) * 2 * (shape_1[1] - 1)] * experiment_count
            print(f'Run {experiment_names[-1]} (x{experiment_length})'
                  f' in {experiment_times[-1] * 1000:0.1f}ms')
        data = pd.DataFrame(
            {
                'run times (s)': experiment_times,
                'count': experiment_lengths,
                'ms/matmul': [(1000.0 * t) / l for t, l in zip(experiment_times, experiment_lengths)],
                'Mop/matmul': experiment_mop,
                'GFLOPS': [(mop * l) / (t * 1000.0)
                           for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)]
            },
            index=pd.Index(experiment_names, name='experiment'))
        data.to_csv(self.output_path / f'matmul_{data_type.value}.csv', sep='\t')
        mean_data = data[['ms/matmul', 'GFLOPS']].groupby(data.index, sort=False).mean()
        max_data = data[['Mop/matmul']].groupby(data.index, sort=False).max()
        figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12))
        figure.suptitle(f'Torch Matmul ({data_type.value})', fontsize=16)
        for axe in axes[:-1]:
            axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        chart = sns.barplot(x=max_data.index, y='Mop/matmul', data=max_data, ax=axes[0], order=data.index.unique())
        axes[0].set_yscale("log")
        for p, value in zip(chart.patches, max_data['Mop/matmul']):
            chart.annotate(f'{value:0.3f}',
                           (p.get_x() + p.get_width() / 2.0, p.get_height()),
                           ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                           textcoords='offset points')
        chart = sns.barplot(x=data.index, y='ms/matmul', data=data, ax=axes[1])
        for p, value in zip(chart.patches, mean_data['ms/matmul']):
            chart.annotate(f'{value:.3f}',
                           (p.get_x() + p.get_width() / 2.0, p.get_height()),
                           ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                           textcoords='offset points')
        chart = sns.barplot(x=data.index, y='GFLOPS', data=data, ax=axes[2])
        for p, value in zip(chart.patches, mean_data['GFLOPS']):
            chart.annotate(f'{value:.3f}',
                           (p.get_x() + p.get_width() / 2.0, p.get_height()),
                           ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                           textcoords='offset points')
        plt.xticks(rotation=20)
        plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.93, right=0.99, bottom=0.1, left=0.05)
        plt.savefig(self.output_path / f'matmul_{data_type.value}.png')