#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Width of one square shared-memory tile; a 16x16 tile gives 256 threads
// per block, a safe configuration on all current GPUs.
#define TILE_SIZE 16

// Minimal CUDA error handler: print the error string and source location,
// then optionally abort.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

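// Convenience wrapper so every CUDA API call can be checked in one line.
// This is a common idiom; the macro (and its name, gpuErrchk) is an
// addition here, not part of the original listing.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
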
// Tiled matrix multiplication: C = A * B for square N x N matrices.
// Each block computes one TILE_SIZE x TILE_SIZE tile of C, staging tiles of
// A and B through shared memory to cut global-memory traffic.
__global__ void matrixMulKernel(const float *A, const float *B, float *C, int N) {
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * TILE_SIZE + ty;  // row of C this thread computes
    int col = blockIdx.x * TILE_SIZE + tx;  // column of C this thread computes
    float sum = 0.0f;

    // Walk the tiles along the shared dimension: ceil(N / TILE_SIZE) steps.
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Cooperatively load one tile of A and one tile of B, padding
        // out-of-range elements with zero so they don't affect the sum.
        if (row < N && t * TILE_SIZE + tx < N)
            sharedA[ty][tx] = A[row * N + t * TILE_SIZE + tx];
        else
            sharedA[ty][tx] = 0.0f;

        if (col < N && t * TILE_SIZE + ty < N)
            sharedB[ty][tx] = B[(t * TILE_SIZE + ty) * N + col];
        else
            sharedB[ty][tx] = 0.0f;

        __syncthreads();  // both tiles must be fully loaded before use

        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[ty][k] * sharedB[k][tx];
        }

        __syncthreads();  // finish reading before the next iteration overwrites
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}

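// CPU reference implementation (an added sketch for verification; the name
// matMulCPU is our choice, not from the original listing). A plain O(N^3)
// triple loop, practical only for small N.
void matMulCPU(const float *A, const float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}
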
// Host wrapper: allocate device buffers, copy inputs over, launch the
// kernel, and copy the result back.
void matrixMultiply(float *h_A, float *h_B, float *h_C, int N) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);

    gpuErrchk(cudaMalloc(&d_A, size));
    gpuErrchk(cudaMalloc(&d_B, size));
    gpuErrchk(cudaMalloc(&d_C, size));

    gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMulKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

    // Check for launch errors first, then for errors raised during execution.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

int main() {
    int N = 64;  // matrix dimension (N x N)
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Fill inputs with small integer values in [0, 9].
    for (int i = 0; i < N * N; i++) {
        h_A[i] = (float)(rand() % 10);
        h_B[i] = (float)(rand() % 10);
    }

    matrixMultiply(h_A, h_B, h_C, N);

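    // Sanity check against the CPU reference above (an added sketch, not part
    // of the original program). With entries in [0, 9] and N = 64, every dot
    // product is a small integer, exactly representable in float, so the GPU
    // and CPU results should match exactly.
    float *h_ref = (float *)malloc(size);
    if (h_ref) {
        matMulCPU(h_A, h_B, h_ref, N);
        int mismatches = 0;
        for (int i = 0; i < N * N; i++)
            if (h_C[i] != h_ref[i]) mismatches++;
        printf("Verification vs CPU reference: %s (%d mismatches)\n",
               mismatches == 0 ? "PASS" : "FAIL", mismatches);
        free(h_ref);
    }
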
    printf("Result matrix:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%0.1f ", h_C[i * N + j]);
        }
        printf("\n");
    }

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}