02-05-2020, 01:48 PM
Summary:
- A stress test of SGEMM (matrix multiplication) using OpenCL slows down after about 150 iterations.
- Details:
- The test runs fine during the first 100 iterations (multiplying two 512 x 512 matrices). It slows down to 1/2 speed after about 150 iterations, and slows down further to about 1/4 speed after 200 iterations, etc.
- Note: the test runs fine if it sleeps for 20 seconds after every 100 iterations.
- The test runs fine during the first 100 iterations (multiplying two 512 x 512 matrices). It slows down to 1/2 speed after about 150 iterations, and slows down further to about 1/4 speed after 200 iterations, etc.
- Details:
- Does the Pinebook Pro GPU have a hardware problem (e.g. overheating) that prevents running a long OpenCL stress test?
- Pinebook pro setup highlights:
- Ubuntu 18.04 64bit (based on the root file system from: https://github.com/ayufan-rock64/linux-b...m64.img.xz)
- Compiler: gcc 8.3
- OpenCL 1.2 (wget --timestamp https://github.com/rockchip-linux/rk-roo..._arm64.deb)
- ARM Compute Library 19.11.1 (native compile of sources from https://github.com/arm-software/ComputeLibrary)
- Ubuntu 18.04 64bit (based on the root file system from: https://github.com/ayufan-rock64/linux-b...m64.img.xz)
- OpenCL is verified with:
- clinfo
- clpeak (from https://github.com/krrishnarraj/clpeak)
- clinfo
- Stress test:
- SGEMM (matrix multiplication) using OpenCL of the ARM Compute Library 19.11.1 - the test code is appended below
- SGEMM (matrix multiplication) using OpenCL of the ARM Compute Library 19.11.1 - the test code is appended below
Code:
// Example code is based on:
// * https://github.com/ctuning/ck-math/tree/master/program/acl-sgemm-opencl-example
//
// #ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
// #error "This example needs to be built with -DARM_COMPUTE_CL"
// #endif /* ARM_COMPUTE_CL */
#include <arm_compute/core/Types.h>
#include <arm_compute/core/Helpers.h>
#include <arm_compute/core/ITensor.h>
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include <arm_compute/runtime/Tensor.h>
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cctype>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <sys/time.h>
using namespace arm_compute;
// Default matrix dimensions MM, MN and MK (512 each). Every default is wrapped
// in #ifndef, so it can be overridden from the compiler command line
// (e.g. -DMM=1024) without editing this file.
#ifndef MM
#define MM 512
#endif
#ifndef MN
#define MN 512
#endif
#ifndef MK
#define MK 512
#endif
namespace
{
/// @brief Overrides `value` with a positive integer read from an environment variable.
///
/// When the variable is unset, `value` is left untouched and the call succeeds.
/// When it is set, the whole string must parse as a base-10 integer in
/// [1, max_value]; anything else (empty string, trailing garbage, zero,
/// negative, out of range) is reported on stderr and rejected.
///
/// @param name       Name of the environment variable to read.
/// @param max_value  Largest value that is accepted.
/// @param value      In: fallback value. Out: parsed value when the variable is set and valid.
/// @return true when `value` holds a usable number, false when the variable was set but invalid.
bool read_positive_env(const char *name, long max_value, long &value)
{
    const char *text = std::getenv(name);
    if (text == nullptr)
    {
        return true;
    }

    errno = 0;
    char *end = nullptr;
    const long parsed = std::strtol(text, &end, 10);
    const bool consumed_all = (end != text) && (*end == '\0');
    if (!consumed_all || errno == ERANGE || parsed <= 0 || parsed > max_value)
    {
        std::fprintf(stderr, "Invalid value for %s: \"%s\" (expected an integer in [1, %ld])\n",
                     name, text, max_value);
        return false;
    }

    value = parsed;
    return true;
}
} // namespace

/// @brief SGEMM stress test: runs CLGEMM repeatedly and reports the average throughput.
///
/// The matrix dimensions default to the MM / MN / MK macros and the repetition
/// count defaults to 1. Each can be overridden at run time through the
/// environment variables MM, MN, MK and NUM_REPETITIONS.
///
/// @return 0 on success, EXIT_FAILURE when an environment override is invalid.
int main(void)
{
    long num_repetitions = 1;
    long m_value = MM;
    long n_value = MN;
    long k_value = MK;

    // Validate every override up front: a zero repetition count would divide by
    // zero below, and a negative dimension would wrap to a huge unsigned value.
    const bool overrides_ok = read_positive_env("NUM_REPETITIONS", LONG_MAX, num_repetitions)
                              && read_positive_env("MM", INT_MAX, m_value)
                              && read_positive_env("MN", INT_MAX, n_value)
                              && read_positive_env("MK", INT_MAX, k_value);
    if (!overrides_ok)
    {
        return EXIT_FAILURE;
    }

    const unsigned int m = static_cast<unsigned int>(m_value);
    const unsigned int n = static_cast<unsigned int>(n_value);
    const unsigned int k = static_cast<unsigned int>(k_value);

    std::printf("NUM_REPETITIONS = %ld\n", num_repetitions);
    std::printf("m = %u\n", m);
    std::printf("n = %u\n", n);
    std::printf("k = %u\n", k);

    // Shapes for A, B and the output O, built from (k,m), (n,k) and (n,m).
    // NOTE(review): presumably TensorShape takes the width (columns) first, making
    // A an m x k matrix, B a k x n matrix and O an m x n matrix - confirm against ACL docs.
    TensorShape AShape(k, m);
    TensorShape BShape(n, k);
    TensorShape OShape(n, m);

    CLTensor ATensor;
    CLTensor BTensor;
    CLTensor OTensor;

    CLScheduler::get().default_init();

    ATensor.allocator()->init(TensorInfo(AShape, Format::F32));
    BTensor.allocator()->init(TensorInfo(BShape, Format::F32));
    OTensor.allocator()->init(TensorInfo(OShape, Format::F32));

    // The function is configured before the tensors are allocated, as in the
    // original example. No third input tensor is passed (nullptr).
    CLGEMM gemm;
    gemm.configure(&ATensor, &BTensor, nullptr, &OTensor, 2.0f, 2.0f);

    ATensor.allocator()->allocate();
    BTensor.allocator()->allocate();
    OTensor.allocator()->allocate();

    // steady_clock is monotonic, so the measurement cannot be skewed by
    // wall-clock adjustments (NTP, manual changes) during a long run.
    const auto start = std::chrono::steady_clock::now();
    for (long r = 0; r < num_repetitions; ++r)
    {
        gemm.run();
        // Synchronise with the scheduler after every run so that the timed
        // region covers the whole iteration.
        CLScheduler::get().sync();
    }
    const auto stop = std::chrono::steady_clock::now();

    const double total_time = std::chrono::duration<double>(stop - start).count();
    const double flops = 2.0 * m * n * k; // one multiply and one add per (m, n, k) triple
    const double gflops = 1e-9 * flops;
    const double avg_time = total_time / num_repetitions;
    const double avg_gflops_per_sec = gflops / avg_time;

    std::printf("M = %u\nN = %u\nK = %u\n", m, n, k);
    std::printf("TIME_TOTAL = %lf\n", total_time);
    std::printf("TIME_AVG = %lf\n", avg_time);
    std::printf("GFLOPS_AVG = %lf\n", avg_gflops_per_sec);
    std::printf("------------- CLBLAST-STYLE_OUTPUT\n");
    std::printf("m = %u\nn = %u\nk = %u\n", m, n, k);
    std::printf("ms_1 = %lf\n", avg_time * 1000);
    std::printf("GFLOPS_1 = %lf\n", avg_gflops_per_sec);
    return 0;
}