libflame revision_anchor
Functions
FLASH_Queue_gpu.h File Reference

(r)

Go to the source code of this file.

Functions

void FLASH_Queue_init_gpu (void)
 
void FLASH_Queue_finalize_gpu (void)
 
FLA_Error FLASH_Queue_enable_gpu (void)
 
FLA_Error FLASH_Queue_disable_gpu (void)
 
FLA_Bool FLASH_Queue_get_enabled_gpu (void)
 
void FLASH_Queue_set_gpu_num_blocks (dim_t n_blocks)
 
dim_t FLASH_Queue_get_gpu_num_blocks (void)
 
FLA_Error FLASH_Queue_bind_gpu (int thread)
 
FLA_Error FLASH_Queue_alloc_gpu (dim_t size, FLA_Datatype datatype, void **buffer_gpu)
 
FLA_Error FLASH_Queue_free_gpu (void *buffer_gpu)
 
FLA_Error FLASH_Queue_write_gpu (FLA_Obj obj, void *buffer_gpu)
 
FLA_Error FLASH_Queue_read_gpu (FLA_Obj obj, void *buffer_gpu)
 
void FLASH_Queue_exec_task_gpu (FLASH_Task *t, void **input_arg, void **output_arg)
 

Function Documentation

◆ FLASH_Queue_alloc_gpu()

FLA_Error FLASH_Queue_alloc_gpu ( dim_t  size,
FLA_Datatype  datatype,
void **  buffer_gpu 
)
155{
157
158 // Allocate memory for a block on GPU.
159 status = cublasAlloc( size,
160 FLA_Obj_datatype_size( datatype ),
161 buffer_gpu );
162
163 // Check to see if the allocation was successful.
166
167 return FLA_SUCCESS;
168}
dim_t FLA_Obj_datatype_size(FLA_Datatype datatype)
Definition FLA_Query.c:61
int i
Definition bl1_axmyv2.c:145

References FLA_Obj_datatype_size().

Referenced by FLASH_Queue_create_gpu().

◆ FLASH_Queue_bind_gpu()

FLA_Error FLASH_Queue_bind_gpu ( int  thread)
139{
140 // Bind a GPU to this thread.
141 cudaSetDevice( thread );
142
143 return FLA_SUCCESS;
144}

Referenced by FLASH_Queue_create_gpu().

◆ FLASH_Queue_disable_gpu()

FLA_Error FLASH_Queue_disable_gpu ( void  )
76{
77 if ( FLASH_Queue_stack_depth() == 0 )
78 {
79 // Disable if not begin parallel region yet.
80 flash_queue_enabled_gpu = FALSE;
81 return FLA_SUCCESS;
82 }
83 else
84 {
85 // Cannot change status during parallel region.
86 return FLA_FAILURE;
87 }
88}
unsigned int FLASH_Queue_stack_depth(void)
Definition FLASH_Queue.c:106

References FLASH_Queue_stack_depth().

◆ FLASH_Queue_enable_gpu()

FLA_Error FLASH_Queue_enable_gpu ( void  )
55{
57 {
58 // Enable if not begin parallel region yet and SuperMatrix is enabled.
59 flash_queue_enabled_gpu = TRUE;
60 return FLA_SUCCESS;
61 }
62 else
63 {
64 // Cannot change status during parallel region.
65 return FLA_FAILURE;
66 }
67}
FLA_Bool FLASH_Queue_get_enabled(void)
Definition FLASH_Queue.c:171

References FLASH_Queue_get_enabled(), and FLASH_Queue_stack_depth().

◆ FLASH_Queue_exec_task_gpu()

void FLASH_Queue_exec_task_gpu ( FLASH_Task *  t,
void **  input_arg,
void **  output_arg 
)
233{
234 // Define local function pointer types.
235
236 // Level-3 BLAS
246
247 // Level-2 BLAS
250
251 // Level-1 BLAS
252 typedef FLA_Error(*flash_axpy_gpu_p)(FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu);
253 typedef FLA_Error(*flash_copy_gpu_p)(FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu);
256
257 // Only execute task if it is not NULL.
258 if ( t == NULL )
259 return;
260
261 // Now "switch" between the various possible task functions.
262
263 // FLA_Gemm
264 if ( t->func == (void *) FLA_Gemm_task )
265 {
266 flash_gemm_gpu_p func;
268
269 func( ( FLA_Trans ) t->int_arg[0],
270 ( FLA_Trans ) t->int_arg[1],
271 t->fla_arg[0],
272 t->input_arg[0],
273 input_arg[0],
274 t->input_arg[1],
275 input_arg[1],
276 t->fla_arg[1],
277 t->output_arg[0],
278 output_arg[0] );
279 }
280 // FLA_Hemm
281 else if ( t->func == (void *) FLA_Hemm_task )
282 {
283 flash_hemm_gpu_p func;
285
286 func( ( FLA_Side ) t->int_arg[0],
287 ( FLA_Uplo ) t->int_arg[1],
288 t->fla_arg[0],
289 t->input_arg[0],
290 input_arg[0],
291 t->input_arg[1],
292 input_arg[1],
293 t->fla_arg[1],
294 t->output_arg[0],
295 output_arg[0] );
296 }
297 // FLA_Herk
298 else if ( t->func == (void *) FLA_Herk_task )
299 {
300 flash_herk_gpu_p func;
302
303 func( ( FLA_Uplo ) t->int_arg[0],
304 ( FLA_Trans ) t->int_arg[1],
305 t->fla_arg[0],
306 t->input_arg[0],
307 input_arg[0],
308 t->fla_arg[1],
309 t->output_arg[0],
310 output_arg[0] );
311 }
312 // FLA_Her2k
313 else if ( t->func == (void *) FLA_Her2k_task )
314 {
317
318 func( ( FLA_Uplo ) t->int_arg[0],
319 ( FLA_Trans ) t->int_arg[1],
320 t->fla_arg[0],
321 t->input_arg[0],
322 input_arg[0],
323 t->input_arg[1],
324 input_arg[1],
325 t->fla_arg[1],
326 t->output_arg[0],
327 output_arg[0] );
328 }
329 // FLA_Symm
330 else if ( t->func == (void *) FLA_Symm_task )
331 {
332 flash_symm_gpu_p func;
334
335 func( ( FLA_Side ) t->int_arg[0],
336 ( FLA_Uplo ) t->int_arg[1],
337 t->fla_arg[0],
338 t->input_arg[0],
339 input_arg[0],
340 t->input_arg[1],
341 input_arg[1],
342 t->fla_arg[1],
343 t->output_arg[0],
344 output_arg[0] );
345 }
346 // FLA_Syrk
347 else if ( t->func == (void *) FLA_Syrk_task )
348 {
349 flash_syrk_gpu_p func;
351
352 func( ( FLA_Uplo ) t->int_arg[0],
353 ( FLA_Trans ) t->int_arg[1],
354 t->fla_arg[0],
355 t->input_arg[0],
356 input_arg[0],
357 t->fla_arg[1],
358 t->output_arg[0],
359 output_arg[0] );
360 }
361 // FLA_Syr2k
362 else if ( t->func == (void *) FLA_Syr2k_task )
363 {
366
367 func( ( FLA_Uplo ) t->int_arg[0],
368 ( FLA_Trans ) t->int_arg[1],
369 t->fla_arg[0],
370 t->input_arg[0],
371 input_arg[0],
372 t->input_arg[1],
373 input_arg[1],
374 t->fla_arg[1],
375 t->output_arg[0],
376 output_arg[0] );
377 }
378 // FLA_Trmm
379 else if ( t->func == (void *) FLA_Trmm_task )
380 {
381 flash_trmm_gpu_p func;
383
384 func( ( FLA_Side ) t->int_arg[0],
385 ( FLA_Uplo ) t->int_arg[1],
386 ( FLA_Trans ) t->int_arg[2],
387 ( FLA_Diag ) t->int_arg[3],
388 t->fla_arg[0],
389 t->input_arg[0],
390 input_arg[0],
391 t->output_arg[0],
392 output_arg[0] );
393 }
394 // FLA_Trsm
395 else if ( t->func == (void *) FLA_Trsm_task )
396 {
397 flash_trsm_gpu_p func;
399
400 func( ( FLA_Side ) t->int_arg[0],
401 ( FLA_Uplo ) t->int_arg[1],
402 ( FLA_Trans ) t->int_arg[2],
403 ( FLA_Diag ) t->int_arg[3],
404 t->fla_arg[0],
405 t->input_arg[0],
406 input_arg[0],
407 t->output_arg[0],
408 output_arg[0] );
409 }
410 // FLA_Gemv
411 else if ( t->func == (void *) FLA_Gemv_task )
412 {
413 flash_gemv_gpu_p func;
415
416 func( ( FLA_Trans ) t->int_arg[0],
417 t->fla_arg[0],
418 t->input_arg[0],
419 input_arg[0],
420 t->input_arg[1],
421 input_arg[1],
422 t->fla_arg[1],
423 t->output_arg[0],
424 output_arg[0] );
425 }
426 // FLA_Trsv
427 else if ( t->func == (void *) FLA_Trsv_task )
428 {
429 flash_trsv_gpu_p func;
431
432 func( ( FLA_Uplo ) t->int_arg[0],
433 ( FLA_Trans ) t->int_arg[1],
434 ( FLA_Diag ) t->int_arg[2],
435 t->input_arg[0],
436 input_arg[0],
437 t->output_arg[0],
438 output_arg[0] );
439 }
440 // FLA_Axpy
441 else if ( t->func == (void *) FLA_Axpy_task )
442 {
443 flash_axpy_gpu_p func;
445
446 func( t->fla_arg[0],
447 t->input_arg[0],
448 input_arg[0],
449 t->output_arg[0],
450 output_arg[0] );
451 }
452 // FLA_Copy
453 else if ( t->func == (void *) FLA_Copy_task )
454 {
455 flash_copy_gpu_p func;
457
458 func( t->input_arg[0],
459 input_arg[0],
460 t->output_arg[0],
461 output_arg[0] );
462 }
463 // FLA_Scal
464 else if ( t->func == (void *) FLA_Scal_task )
465 {
466 flash_scal_gpu_p func;
468
469 func( t->fla_arg[0],
470 t->output_arg[0],
471 output_arg[0] );
472 }
473 // FLA_Scalr
474 else if ( t->func == (void *) FLA_Scalr_task )
475 {
478
479 func( ( FLA_Uplo ) t->int_arg[0],
480 t->fla_arg[0],
481 t->output_arg[0],
482 output_arg[0] );
483 }
484 else
485 {
487 }
488
489 return;
490}
FLA_Error FLA_Scal_task(FLA_Obj alpha, FLA_Obj A, fla_scal_t *cntl)
Definition FLA_Scal_task.c:13
FLA_Error FLA_Scalr_task(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t *cntl)
Definition FLA_Scalr_task.c:13
FLA_Error FLA_Copy_external_gpu(FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu)
Definition FLA_Copy_external_gpu.c:17
FLA_Error FLA_Copy_task(FLA_Obj A, FLA_Obj B, fla_copy_t *cntl)
Definition FLA_Copy_task.c:13
FLA_Error FLA_Scal_external_gpu(FLA_Obj alpha, FLA_Obj A, void *A_gpu)
Definition FLA_Scal_external_gpu.c:17
FLA_Error FLA_Axpy_task(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t *cntl)
Definition FLA_Axpy_task.c:13
FLA_Error FLA_Scalr_external_gpu(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void *A_gpu)
Definition FLA_Scalr_external_gpu.c:17
FLA_Error FLA_Axpy_external_gpu(FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu)
Definition FLA_Axpy_external_gpu.c:17
FLA_Error FLA_Gemv_task(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t *cntl)
Definition FLA_Gemv_task.c:13
FLA_Error FLA_Gemv_external_gpu(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj x, void *x_gpu, FLA_Obj beta, FLA_Obj y, void *y_gpu)
Definition FLA_Gemv_external_gpu.c:17
FLA_Error FLA_Trsv_task(FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t *cntl)
Definition FLA_Trsv_task.c:13
FLA_Error FLA_Trsv_external_gpu(FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, void *A_gpu, FLA_Obj x, void *x_gpu)
Definition FLA_Trsv_external_gpu.c:17
FLA_Error FLA_Syrk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t *cntl)
Definition FLA_Syrk_task.c:13
FLA_Error FLA_Gemm_external_gpu(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Gemm_external_gpu.c:17
FLA_Error FLA_Herk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t *cntl)
Definition FLA_Herk_task.c:13
FLA_Error FLA_Syr2k_external_gpu(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Syr2k_external_gpu.c:17
FLA_Error FLA_Trmm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t *cntl)
Definition FLA_Trmm_task.c:13
FLA_Error FLA_Hemm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t *cntl)
Definition FLA_Hemm_task.c:13
FLA_Error FLA_Trmm_external_gpu(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu)
Definition FLA_Trmm_external_gpu.c:17
FLA_Error FLA_Trsm_external_gpu(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu)
Definition FLA_Trsm_external_gpu.c:17
FLA_Error FLA_Syrk_external_gpu(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Syrk_external_gpu.c:17
FLA_Error FLA_Her2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t *cntl)
Definition FLA_Her2k_task.c:13
FLA_Error FLA_Her2k_external_gpu(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Her2k_external_gpu.c:17
FLA_Error FLA_Syr2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t *cntl)
Definition FLA_Syr2k_task.c:13
FLA_Error FLA_Symm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t *cntl)
Definition FLA_Symm_task.c:13
FLA_Error FLA_Symm_external_gpu(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Symm_external_gpu.c:17
FLA_Error FLA_Trsm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t *cntl)
Definition FLA_Trsm_task.c:13
FLA_Error FLA_Gemm_task(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t *cntl)
Definition FLA_Gemm_task.c:13
FLA_Error FLA_Hemm_external_gpu(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj B, void *B_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Hemm_external_gpu.c:17
FLA_Error FLA_Herk_external_gpu(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void *A_gpu, FLA_Obj beta, FLA_Obj C, void *C_gpu)
Definition FLA_Herk_external_gpu.c:17
int FLA_Error
Definition FLA_type_defs.h:47
int FLA_Side
Definition FLA_type_defs.h:51
int FLA_Trans
Definition FLA_type_defs.h:53
int FLA_Uplo
Definition FLA_type_defs.h:52
int FLA_Diag
Definition FLA_type_defs.h:55
Definition FLA_type_defs.h:159

References FLASH_Task_s::fla_arg, FLA_Axpy_external_gpu(), FLA_Axpy_task(), FLA_Copy_external_gpu(), FLA_Copy_task(), FLA_Gemm_external_gpu(), FLA_Gemm_task(), FLA_Gemv_external_gpu(), FLA_Gemv_task(), FLA_Hemm_external_gpu(), FLA_Hemm_task(), FLA_Her2k_external_gpu(), FLA_Her2k_task(), FLA_Herk_external_gpu(), FLA_Herk_task(), FLA_Scal_external_gpu(), FLA_Scal_task(), FLA_Scalr_external_gpu(), FLA_Scalr_task(), FLA_Symm_external_gpu(), FLA_Symm_task(), FLA_Syr2k_external_gpu(), FLA_Syr2k_task(), FLA_Syrk_external_gpu(), FLA_Syrk_task(), FLA_Trmm_external_gpu(), FLA_Trmm_task(), FLA_Trsm_external_gpu(), FLA_Trsm_task(), FLA_Trsv_external_gpu(), FLA_Trsv_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_finalize_gpu()

void FLASH_Queue_finalize_gpu ( void  )
42{
44
45 return;
46}

Referenced by FLASH_Queue_finalize().

◆ FLASH_Queue_free_gpu()

FLA_Error FLASH_Queue_free_gpu ( void *  buffer_gpu)
177{
178 // Free memory for a block on GPU.
179 cublasFree( buffer_gpu );
180
181 return FLA_SUCCESS;
182}

Referenced by FLASH_Queue_destroy_gpu().

◆ FLASH_Queue_get_enabled_gpu()

FLA_Bool FLASH_Queue_get_enabled_gpu ( void  )
97{
98 // Return if SuperMatrix is enabled, but always false if not.
100 return flash_queue_enabled_gpu;
101 else
102 return FALSE;
103}

References FLASH_Queue_get_enabled().

Referenced by FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_flush_gpu(), FLASH_Queue_wait_dequeue(), and FLASH_Queue_wait_dequeue_block().

◆ FLASH_Queue_get_gpu_num_blocks()

dim_t FLASH_Queue_get_gpu_num_blocks ( void  )

◆ FLASH_Queue_init_gpu()

void FLASH_Queue_init_gpu ( void  )
29{
30 cublasInit();
31
32 return;
33}

Referenced by FLASH_Queue_init().

◆ FLASH_Queue_read_gpu()

FLA_Error FLASH_Queue_read_gpu ( FLA_Obj  obj,
void *  buffer_gpu 
)
211{
212 // Read the memory of a block on GPU to main memory.
214 FLA_Obj_width( obj ),
216 buffer_gpu,
217 FLA_Obj_length( obj ),
219 FLA_Obj_col_stride( obj ) );
220
221 return FLA_SUCCESS;
222}
dim_t FLA_Obj_width(FLA_Obj obj)
Definition FLA_Query.c:123
dim_t FLA_Obj_length(FLA_Obj obj)
Definition FLA_Query.c:116
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition FLA_Query.c:174
void * FLA_Obj_buffer_at_view(FLA_Obj obj)
Definition FLA_Query.c:215
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition FLA_Query.c:13

References FLA_Obj_buffer_at_view(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_length(), and FLA_Obj_width().

Referenced by FLASH_Queue_destroy_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_flush_gpu(), and FLASH_Queue_update_block_gpu().

◆ FLASH_Queue_set_gpu_num_blocks()

void FLASH_Queue_set_gpu_num_blocks ( dim_t  n_blocks)
112{
113 flash_queue_gpu_n_blocks = n_blocks;
114
115 return;
116}

◆ FLASH_Queue_write_gpu()

FLA_Error FLASH_Queue_write_gpu ( FLA_Obj  obj,
void *  buffer_gpu 
)
191{
192 // Write the contents of a block in main memory to GPU.
194 FLA_Obj_width( obj ),
197 FLA_Obj_col_stride( obj ),
198 buffer_gpu,
199 FLA_Obj_length( obj ) );
200
201 return FLA_SUCCESS;
202}

References FLA_Obj_buffer_at_view(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_length(), and FLA_Obj_width().

Referenced by FLASH_Queue_update_block_gpu().