Initial commit
This commit is contained in:
commit
fbf6898dd9
6 changed files with 220 additions and 0 deletions
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
*.pyc
|
||||
*.temp
|
||||
|
||||
output
|
||||
41
benchmark.py
Normal file
41
benchmark.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
|
||||
from src.base import DataType
|
||||
from src.torch.matmul import TorchMatmulBench
|
||||
|
||||
|
||||
# Square-ish operand shape pairs of increasing size; each entry is
# ((rows1, cols1), (rows2, cols2)) for one matmul experiment.
MATMUL_SHAPE_PAIRS = [
    ((100, 100), (100, 100)),
    ((100, 200), (200, 100)),
    ((128, 128), (128, 128)),
    ((200, 100), (100, 200)),
    ((200, 200), (200, 200)),
    ((256, 256), (256, 256)),
    ((256, 512), (512, 256)),
    ((400, 400), (400, 400)),
    ((512, 256), (256, 512)),
    ((512, 512), (512, 512)),
    ((800, 800), (800, 800)),
    ((1000, 1000), (1000, 1000)),
    ((1200, 1200), (1200, 1200)),
]


def main():
    """Entry point: parse the CLI and run the matmul benchmark for every DataType.

    Command line:
        --output: directory where CSV/PNG results are written (default: output).
    """
    parser = ArgumentParser()
    parser.add_argument('--output', type=Path, default=Path('output'), help='Path to output files')
    arguments = parser.parse_args()

    output_path: Path = arguments.output

    # exist_ok avoids the race between a separate exists() check and mkdir().
    output_path.mkdir(parents=True, exist_ok=True)

    # Construct the bench once: device detection and per-device output
    # directory setup happen in its __init__ and do not depend on dtype.
    bench = TorchMatmulBench(output_path)
    for data_type in DataType:
        bench.run(MATMUL_SHAPE_PAIRS, 12, data_type)
    print('Benchmark done')


if __name__ == '__main__':
    main()
|
||||
22
src/base.py
Normal file
22
src/base.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from pathlib import Path
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class Device(Enum):
    """Kind of compute device a benchmark ran on."""
    CPU = 'cpu'
    GPU = 'gpu'


class DataType(Enum):
    """Floating-point element types the benchmarks cover."""
    FLOAT16 = 'float16'
    FLOAT32 = 'float32'
    FLOAT64 = 'float64'


class Base():
    """Shared benchmark state: tracks where result files get written."""

    def __init__(self, output_path: Path):
        # Keep the root directory around so a device-specific path can be
        # derived from it later via set_output_path().
        self._base_output_path = output_path
        self.output_path = output_path

    def set_output_path(self, device: Device, device_name: str):
        """Redirect output into a '<device>_<name>' subdirectory of the root."""
        subdirectory = f'{device.value}_{device_name}'
        self.output_path = self._base_output_path / subdirectory
|
||||
23
src/torch/base.py
Normal file
23
src/torch/base.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from src.base import Base, Device
|
||||
from src.utils import get_cpu_name, get_nvidia_name
|
||||
|
||||
|
||||
class TorchBase(Base):
    """Torch-specific benchmark base: selects the compute device and points
    output at a matching per-device directory.

    Uses the first CUDA device when available, otherwise the CPU.
    """

    def __init__(self, output_path: Path):
        """
        Args:
            output_path: root results directory; a '<device>_<name>'
                subdirectory is created beneath it.
        """
        super().__init__(output_path)

        # Query availability once and reuse the answer.
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if cuda_available else 'cpu')
        if cuda_available:
            if torch.cuda.device_count() > 1:
                print('WARNING : no multiple CUDA device benchmark implemented yet (only using first)')
            self.set_output_path(Device.GPU, get_nvidia_name())
            # Let cudnn autotune algorithms; shapes are fixed per experiment.
            torch.backends.cudnn.benchmark = True
        else:
            self.set_output_path(Device.CPU, get_cpu_name())

        # exist_ok avoids the race between a separate exists() check and mkdir().
        self.output_path.mkdir(parents=True, exist_ok=True)
|
||||
112
src/torch/matmul.py
Normal file
112
src/torch/matmul.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
import time
|
||||
|
||||
from src.base import DataType
|
||||
from src.torch.base import TorchBase
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import torch
|
||||
|
||||
|
||||
class TorchMatmulBench(TorchBase):
    """Benchmarks dense matrix multiplication (the ``@`` operator) in torch.

    For every pair of operand shapes the benchmark warms up, estimates the
    achievable rate, then times ``experiment_count`` repetitions and writes a
    tab-separated CSV plus a three-panel bar chart (Mop/matmul, ms/matmul,
    GFLOPS) into ``self.output_path``.
    """

    def run(self,
            experiment_args: list[tuple[tuple[int, int], tuple[int, int]]],
            experiment_count: int,
            data_type: DataType):
        """Run the benchmark and write matmul_<dtype>.csv / matmul_<dtype>.png.

        Args:
            experiment_args: shape pairs ((rows1, cols1), (rows2, cols2));
                each pair is one experiment.
            experiment_count: number of timed repetitions per experiment.
            data_type: element type used for both operands.

        Raises:
            RuntimeError: if ``data_type`` has no torch equivalent here.
        """
        sns.set_theme(style="ticks")

        # Map the project-level DataType onto the torch dtypes supported here.
        torch_dtypes = {
            DataType.FLOAT16: torch.float16,
            DataType.FLOAT32: torch.float32,
            DataType.FLOAT64: torch.float64,
        }
        dtype = torch_dtypes.get(data_type)
        if dtype is None:
            raise RuntimeError(f'data_type {data_type.value} not implemented')
        print(f'Starting Torch Matmul Benchmark with data type: {data_type.value}')

        experiment_names = []
        experiment_lengths = []
        experiment_times = []
        experiment_mop = []
        for shape_1, shape_2 in experiment_args:
            tensor_1 = torch.ones(shape_1, dtype=dtype, device=self.device)
            # NOTE(review): presumably scaled to keep result magnitudes small
            # (relevant for float16) — confirm the intended normalization.
            tensor_2 = torch.ones(shape_2, dtype=dtype, device=self.device) / (shape_2[1] - 1.0)

            # Warmup so lazy initialization / cudnn autotuning are not timed.
            for _ in range(20):
                _ = tensor_1 @ tensor_2
            # CUDA kernels launch asynchronously: without a synchronize the
            # wall-clock windows below would time launches, not execution.
            self._synchronize()

            # Speed estimation: count how many matmuls fit a short window.
            counter = 0
            start_time = time.time()
            while time.time() - start_time < 0.2:
                _ = tensor_1 @ tensor_2
                counter += 1
            self._synchronize()
            end_time = time.time()

            target_time = 0.5 / experiment_count  # in s
            experiment_speed = counter / (end_time - start_time)  # in op/s
            # At least 2 iterations per repetition so a repetition never
            # degenerates to a single timer-resolution-limited sample.
            experiment_length = max(int(target_time * experiment_speed), 2)

            run_times = []
            for _ in range(experiment_count):
                start_time = time.time()
                for _ in range(experiment_length):
                    _ = tensor_1 @ tensor_2
                self._synchronize()
                run_times.append(time.time() - start_time)
            experiment_times += run_times
            experiment_names += [f'{shape_1[0]}x{shape_1[1]} @ {shape_2[0]}x{shape_2[1]}'] * experiment_count
            experiment_lengths += [experiment_length] * experiment_count
            # NOTE(review): counts 2 * (cols1 - 1) operations per output cell
            # — confirm this is the intended FLOP model (2*m*n*k is the
            # conventional count for an m×k @ k×n product).
            experiment_mop += [(shape_1[0] * shape_2[1] / 1000_000) * 2 * (shape_1[1] - 1)] * experiment_count
            print(f'Run {experiment_names[-1]} (x{experiment_length})'
                  f' in {experiment_times[-1] * 1000:0.1f}ms')

        data = pd.DataFrame(
            {
                'run times (s)': experiment_times,
                'count': experiment_lengths,
                'ms/matmul': [(1000.0 * t) / l for t, l in zip(experiment_times, experiment_lengths)],
                'Mop/matmul': experiment_mop,
                'GFLOPS': [(mop * l) / (t * 1000.0)
                           for mop, l, t in zip(experiment_mop, experiment_lengths, experiment_times)]
            },
            index=pd.Index(experiment_names, name='experiment'))
        data.to_csv(self.output_path / f'matmul_{data_type.value}.csv', sep='\t')

        # Mean over repetitions for the bar labels; max works for Mop/matmul
        # because it is constant within an experiment.
        mean_data = data[['ms/matmul', 'GFLOPS']].groupby(data.index, sort=False).mean()
        max_data = data[['Mop/matmul']].groupby(data.index, sort=False).max()

        figure, axes = plt.subplots(nrows=3, sharex=True, figsize=(18, 12))
        figure.suptitle(f'Torch Matmul ({data_type.value})', fontsize=16)
        # Only the bottom panel keeps its x tick labels.
        for axe in axes[:-1]:
            axe.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

        chart = sns.barplot(x=max_data.index, y='Mop/matmul', data=max_data, ax=axes[0], order=data.index.unique())
        axes[0].set_yscale("log")
        self._annotate_bars(chart, max_data['Mop/matmul'])

        chart = sns.barplot(x=data.index, y='ms/matmul', data=data, ax=axes[1])
        self._annotate_bars(chart, mean_data['ms/matmul'])

        chart = sns.barplot(x=data.index, y='GFLOPS', data=data, ax=axes[2])
        self._annotate_bars(chart, mean_data['GFLOPS'])

        plt.xticks(rotation=20)
        plt.subplots_adjust(hspace=0.0, wspace=0.02, top=0.93, right=0.99, bottom=0.1, left=0.05)
        plt.savefig(self.output_path / f'matmul_{data_type.value}.png')

    def _synchronize(self):
        """Block until pending CUDA work finishes so wall-clock timings are
        meaningful; no-op on CPU."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize(self.device)

    def _annotate_bars(self, chart, values):
        """Write each value above the corresponding bar of a seaborn barplot."""
        for p, value in zip(chart.patches, values):
            chart.annotate(f'{value:.3f}',
                           (p.get_x() + p.get_width() / 2.0, p.get_height()),
                           ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                           textcoords='offset points')
|
||||
18
src/utils.py
Normal file
18
src/utils.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import subprocess
|
||||
|
||||
|
||||
def get_cpu_name() -> str:
    """Return a '<architecture>_<model name>' identifier parsed from lscpu.

    Falls back to 'unknown' / 'noname' for fields lscpu does not report.

    Raises:
        subprocess.CalledProcessError: if lscpu exits non-zero.
        FileNotFoundError: if lscpu is not installed.
    """
    raw_out = subprocess.check_output(['lscpu']).decode()
    architecture = 'unknown'
    model = 'noname'
    for out_line in raw_out.split('\n'):
        # Split on the first ':' only, so a value containing ':' stays intact.
        line_info = out_line.strip().split(':', 1)
        if line_info[0].strip() == 'Architecture':
            architecture = line_info[1].strip()
        elif line_info[0].strip() == 'Model name':
            model = line_info[1].strip()
    return f'{architecture}_{model}'
|
||||
|
||||
|
||||
def get_nvidia_name() -> str:
    """Return the product name of the NVIDIA GPU as reported by nvidia-smi."""
    command = ['nvidia-smi', '--format=csv,noheader', '--query-gpu=name']
    raw_output = subprocess.check_output(command)
    return raw_output.decode().strip()
|
||||
Loading…
Add table
Add a link
Reference in a new issue