libflame revision_anchor
Functions
FLASH_Queue_main_prototypes.h File Reference

(r)

Go to the source code of this file.

Functions

void FLASH_Queue_begin (void)
 
void FLASH_Queue_end (void)
 
unsigned int FLASH_Queue_stack_depth (void)
 
FLA_Error FLASH_Queue_enable (void)
 
FLA_Error FLASH_Queue_disable (void)
 
FLA_Bool FLASH_Queue_get_enabled (void)
 
void FLASH_Queue_set_num_threads (unsigned int n_threads)
 
unsigned int FLASH_Queue_get_num_threads (void)
 
void FLASH_Queue_init (void)
 
void FLASH_Queue_finalize (void)
 
unsigned int FLASH_Queue_get_num_tasks (void)
 
void FLASH_Queue_set_verbose_output (FLASH_Verbose verbose)
 
FLASH_Verbose FLASH_Queue_get_verbose_output (void)
 
void FLASH_Queue_set_sorting (FLA_Bool sorting)
 
FLA_Bool FLASH_Queue_get_sorting (void)
 
void FLASH_Queue_set_caching (FLA_Bool caching)
 
FLA_Bool FLASH_Queue_get_caching (void)
 
void FLASH_Queue_set_work_stealing (FLA_Bool work_stealing)
 
FLA_Bool FLASH_Queue_get_work_stealing (void)
 
void FLASH_Queue_set_data_affinity (FLASH_Data_aff data_affinity)
 
FLASH_Data_aff FLASH_Queue_get_data_affinity (void)
 
double FLASH_Queue_get_total_time (void)
 
double FLASH_Queue_get_parallel_time (void)
 
void FLASH_Queue_exec (void)
 
void FLASH_Queue_set_parallel_time (double dtime)
 
void FLASH_Queue_set_block_size (dim_t size)
 
dim_t FLASH_Queue_get_block_size (void)
 
void FLASH_Queue_set_cache_size (dim_t size)
 
dim_t FLASH_Queue_get_cache_size (void)
 
void FLASH_Queue_set_cache_line_size (dim_t size)
 
dim_t FLASH_Queue_get_cache_line_size (void)
 
void FLASH_Queue_set_cores_per_cache (int cores)
 
int FLASH_Queue_get_cores_per_cache (void)
 
void FLASH_Queue_set_cores_per_queue (int cores)
 
int FLASH_Queue_get_cores_per_queue (void)
 
void FLASH_Queue_reset (void)
 
FLASH_Task * FLASH_Queue_get_head_task (void)
 
FLASH_Task * FLASH_Queue_get_tail_task (void)
 
void FLASH_Queue_push (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args,...)
 
void FLASH_Queue_push_input (FLA_Obj obj, FLASH_Task *t)
 
void FLASH_Queue_push_output (FLA_Obj obj, FLASH_Task *t)
 
FLASH_Task * FLASH_Task_alloc (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args)
 
void FLASH_Task_free (FLASH_Task *t)
 
void FLASH_Queue_exec_task (FLASH_Task *t)
 
void FLASH_Queue_verbose_output (void)
 
void FLASH_Queue_init_tasks (void *arg)
 
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
 
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
 
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
 
void FLASH_Queue_prefetch (int cache, void *arg)
 
void FLASH_Queue_prefetch_block (FLA_Obj obj)
 
FLASH_Task * FLASH_Queue_work_stealing (int queue, void *arg)
 
void FLASH_Queue_create_gpu (int thread, void *arg)
 
void FLASH_Queue_destroy_gpu (int thread, void *arg)
 
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
 
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
 
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
 
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_gpu (int thread, void *arg)
 
void FLASH_Queue_exec_parallel (void *arg)
 
void * FLASH_Queue_exec_parallel_function (void *arg)
 
FLASH_Task * FLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
 
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
 
void FLASH_Queue_exec_simulation (void *arg)
 

Function Documentation

◆ FLASH_Queue_begin()

void FLASH_Queue_begin ( void  )
65{
66#ifdef FLA_ENABLE_SUPERMATRIX
67 if ( flash_queue_stack == 0 )
68 {
69 // Save the starting time for the total execution time.
70 flash_queue_total_time = FLA_Clock();
71 }
72#endif
73
74 // Push onto the stack.
75 flash_queue_stack++;
76
77 return;
78}
double FLA_Clock(void)
Definition FLA_Clock.c:20

References FLA_Clock().

Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().

◆ FLASH_Queue_check_block_gpu()

FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg
)
1552{
1554 int k;
1557
1558#ifdef FLA_ENABLE_MULTITHREADING
1559 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560#endif
1561
1562 // Locate the position of the block on the GPU.
1563 for ( k = 0; k < gpu_n_blocks; k++ )
1564 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565 break;
1566
1567 if ( k < gpu_n_blocks )
1568 {
1569 // Request this block if it is dirty.
1570 if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571 {
1572 args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573
1574 r_val = FALSE;
1575 }
1576 }
1577
1578 // Check the victim block.
1579 if ( obj.base == args->victim[thread].obj.base )
1580 r_val = FALSE;
1581
1582#ifdef FLA_ENABLE_MULTITHREADING
1583 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584#endif
1585
1586 return r_val;
1587}
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition FLASH_Queue_gpu.c:119
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:58
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:43
unsigned long dim_t
Definition FLA_type_defs.h:71
int FLA_Bool
Definition FLA_type_defs.h:46
int i
Definition bl1_axmyv2.c:145
Definition FLASH_Queue_exec.c:55
FLA_Obj_gpu * victim
Definition FLASH_Queue_exec.c:107
FLA_Lock * gpu_lock
Definition FLASH_Queue_exec.c:101
FLA_Obj_gpu * gpu
Definition FLASH_Queue_exec.c:104
FLA_Obj obj
Definition FLASH_Queue_exec.c:40
FLA_Bool request
Definition FLASH_Queue_exec.c:49
FLA_Bool clean
Definition FLASH_Queue_exec.c:46
FLA_Base_obj * base
Definition FLA_type_defs.h:168

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

◆ FLASH_Queue_check_gpu()

FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task * t,
void * arg
)
1453{
1454 int i, j, k;
1455 int thread = t->thread;
1456 int n_input_args = t->n_input_args;
1457 int n_output_args = t->n_output_args;
1462 FLA_Obj obj;
1463
1464 // Check the input and output arguments on the GPUs.
1465 for ( i = 0; i < n_input_args + n_output_args; i++ )
1466 {
1467 // Check for duplicate blocks.
1468 duplicate = FALSE;
1469
1470 // Find the correct input or output argument.
1471 if ( i < n_input_args )
1472 {
1473 obj = t->input_arg[i];
1474
1475 for ( j = 0; j < n_output_args && !duplicate; j++ )
1476 {
1477 if ( obj.base == t->output_arg[j].base )
1478 duplicate = TRUE;
1479 }
1480
1481 for ( j = 0; j < i && !duplicate; j++ )
1482 {
1483 if ( obj.base == t->input_arg[j].base )
1484 duplicate = TRUE;
1485 }
1486 }
1487 else
1488 {
1489 obj = t->output_arg[i - n_input_args];
1490
1491 for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1492 {
1493 if ( obj.base == t->output_arg[j].base )
1494 duplicate = TRUE;
1495 }
1496 }
1497
1498 // If the block has not been processed before.
1499 if ( !duplicate )
1500 {
1501 // Macroblock is used.
1502 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1503 {
1504 dim_t jj, kk;
1505 dim_t m = FLA_Obj_length( obj );
1506 dim_t n = FLA_Obj_width( obj );
1507 dim_t cs = FLA_Obj_col_stride( obj );
1508 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1509
1510 // Clear each block in macroblock.
1511 for ( jj = 0; jj < n; jj++ )
1512 {
1513 for ( kk = 0; kk < m; kk++ )
1514 {
1515 obj = *( buf + jj * cs + kk );
1516
1517 t_val = TRUE;
1518
1519 // Check to see if the block is dirty on another GPU.
1520 for ( k = 0; k < n_threads && t_val; k++ )
1521 if ( k != thread )
1523
1524 r_val = r_val && t_val;
1525 }
1526 }
1527 }
1528 else
1529 {
1530 t_val = TRUE;
1531
1532 // Check to see if the block is dirty on another GPU.
1533 for ( k = 0; k < n_threads && t_val; k++ )
1534 if ( k != thread )
1536
1537 r_val = r_val && t_val;
1538 }
1539 }
1540 }
1541
1542 return r_val;
1543}
FLA_Bool FLASH_Queue_check_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1546
unsigned int FLASH_Queue_get_num_threads(void)
Definition FLASH_Queue.c:223
dim_t FLA_Obj_width(FLA_Obj obj)
Definition FLA_Query.c:123
dim_t FLA_Obj_length(FLA_Obj obj)
Definition FLA_Query.c:116
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition FLA_Query.c:51
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition FLA_Query.c:174
Definition FLA_type_defs.h:159

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), and i.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_create_gpu()

void FLASH_Queue_create_gpu ( int  thread,
void * arg
)
1233{
1235 int i;
1237 dim_t block_size = args->block_size;
1238 FLA_Datatype datatype = args->datatype;
1239
1240 // Exit if not using GPU.
1242 return;
1243
1244 // Bind thread to GPU.
1245 FLASH_Queue_bind_gpu( thread );
1246
1247 // Allocate the memory on the GPU for all the blocks a priori.
1248 for ( i = 0; i < gpu_n_blocks; i++ )
1249 FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250
1251 return;
1252}
FLA_Error FLASH_Queue_bind_gpu(int thread)
Definition FLASH_Queue_gpu.c:133
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition FLASH_Queue_gpu.c:91
FLA_Error FLASH_Queue_alloc_gpu(dim_t size, FLA_Datatype datatype, void **buffer_gpu)
Definition FLASH_Queue_gpu.c:147
int FLA_Datatype
Definition FLA_type_defs.h:49
dim_t block_size
Definition FLASH_Queue_exec.c:113
FLA_Datatype datatype
Definition FLASH_Queue_exec.c:116
void * buffer_gpu
Definition FLASH_Queue_exec.c:43

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_destroy_gpu()

void FLASH_Queue_destroy_gpu ( int  thread,
void * arg
)
1261{
1263 int i;
1266
1267 // Exit if not using GPU.
1269 return;
1270
1271 // Examine every block left on the GPU.
1272 for ( i = 0; i < gpu_n_blocks; i++ )
1273 {
1274 gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275
1276 // Flush the blocks that are dirty.
1277 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279
1280 // Free the memory on the GPU for all the blocks.
1281 FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282 }
1283
1284 return;
1285}
FLA_Error FLASH_Queue_free_gpu(void *buffer_gpu)
Definition FLASH_Queue_gpu.c:171
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition FLASH_Queue_gpu.c:205
Definition FLASH_Queue_exec.c:38

References FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_disable()

FLA_Error FLASH_Queue_disable ( void  )
150{
151#ifdef FLA_ENABLE_SUPERMATRIX
152 if ( flash_queue_stack == 0 )
153 {
154 // Disable if not begin parallel region yet.
155 flash_queue_enabled = FALSE;
156 return FLA_SUCCESS;
157 }
158 else
159 {
160 // Cannot change status during parallel region.
161 return FLA_FAILURE;
162 }
163#else
164 // Allow disabling enqueuing even when SuperMatrix is not configured.
165 flash_queue_enabled = FALSE;
166 return FLA_SUCCESS;
167#endif
168}

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

◆ FLASH_Queue_enable()

FLA_Error FLASH_Queue_enable ( void  )
123{
124#ifdef FLA_ENABLE_SUPERMATRIX
125 if ( flash_queue_stack == 0 )
126 {
127 // Enable if not begin parallel region yet.
128 flash_queue_enabled = TRUE;
129 return FLA_SUCCESS;
130 }
131 else
132 {
133 // Cannot change status during parallel region.
134 return FLA_FAILURE;
135 }
136#else
137 // Raise an exception when SuperMatrix is not configured.
139 return FLA_FAILURE;
140#endif
141}

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

◆ FLASH_Queue_end()

void FLASH_Queue_end ( void  )
87{
88 // Pop off the stack.
89 flash_queue_stack--;
90
91#ifdef FLA_ENABLE_SUPERMATRIX
92 if ( flash_queue_stack == 0 )
93 {
94 // Execute tasks if encounter the outermost parallel region.
96
97 // Find the total execution time.
98 flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
99 }
100#endif
101
102 return;
103}
void FLASH_Queue_exec(void)
Definition FLASH_Queue_exec.c:2756

References FLA_Clock(), and FLASH_Queue_exec().

Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().

◆ FLASH_Queue_exec()

void FLASH_Queue_exec ( void  )
2762{
2763 int n_tasks = FLASH_Queue_get_num_tasks();
2764 int i;
2765 double dtime;
2766
2767 // All the necessary variables for the SuperMatrix mechanism.
2768 FLASH_Queue_vars args;
2769
2770 // If the queue is empty, return early.
2771 if ( n_tasks == 0 )
2772 return;
2773
2774 // Turn off all multiple queue implementations.
2777 // Do not use cache affinity yet.
2779
2780 // Allocate memory for task queues.
2781 args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
2782 args.n_ready = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
2783 args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
2784 args.n_wait = ( int* ) FLA_shmalloc( sizeof( int ) );
2785 args.pc = ( int* ) FLA_shmalloc( sizeof( int ) );
2786
2787 // Initialize data.
2788 if ( FLA_is_owner() )
2789 {
2790 args.n_wait[0] = 0;
2791 args.pc[0] = 0;
2792 }
2793
2794 Synch_all();
2795
2796 // Initialize tasks with critical information.
2797 FLASH_Queue_init_tasks( ( void* ) &args );
2798
2799 // Display verbose output before free all tasks.
2802
2803 // Start timing the parallel execution.
2804 dtime = RCCE_wtime();
2805
2806 FLASH_Queue_exec_parallel_function( ( void* ) &args );
2807
2808 // End timing the parallel execution.
2809 dtime = RCCE_wtime() - dtime;
2811
2812 // Free all tasks sequentially.
2813 for ( i = 0; i < n_tasks; i++ )
2814 FLASH_Task_free( args.task_queue[i] );
2815
2816 // Free data.
2817 FLA_free( args.task_queue );
2818 FLA_shfree( args.n_ready );
2819 FLA_shfree( args.wait_queue );
2820 FLA_shfree( args.n_wait );
2821 FLA_shfree( args.pc );
2822
2823 // Reset values for next call to FLASH_Queue_exec().
2825
2826 return;
2827}
void Synch_all()
void * FLASH_Queue_exec_parallel_function(void *arg)
Definition FLASH_Queue_exec.c:2156
void FLASH_Queue_init_tasks(void *arg)
Definition FLASH_Queue_exec.c:394
double RCCE_wtime(void)
void FLASH_Queue_set_data_affinity(FLASH_Data_aff data_affinity)
Definition FLASH_Queue.c:391
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition FLASH_Queue.c:308
void FLASH_Queue_set_caching(FLA_Bool caching)
Definition FLASH_Queue.c:343
void FLASH_Queue_set_parallel_time(double dtime)
Definition FLASH_Queue.c:448
void FLASH_Queue_verbose_output(void)
Definition FLASH_Queue.c:1782
void FLASH_Queue_set_work_stealing(FLA_Bool work_stealing)
Definition FLASH_Queue.c:367
void FLASH_Task_free(FLASH_Task *t)
Definition FLASH_Queue.c:1020
void FLASH_Queue_reset(void)
Definition FLASH_Queue.c:583
unsigned int FLASH_Queue_get_num_tasks(void)
Definition FLASH_Queue.c:284
void * FLA_shmalloc(size_t size)
Definition FLA_Obj.c:21
void FLA_shfree(void *ptr)
Definition FLA_Obj.c:27
void FLA_free(void *ptr)
Definition FLA_Memory.c:247
FLA_Bool FLA_is_owner(void)
Definition FLA_Obj.c:33
void * FLA_malloc(size_t size)
Definition FLA_Memory.c:111
int pc
Definition FLASH_Queue_exec.c:96
int * n_wait
Definition FLASH_Queue_exec.c:2748
int * n_ready
Definition FLASH_Queue_exec.c:2742
FLASH_Queue * wait_queue
Definition FLASH_Queue_exec.c:92
FLASH_Task ** task_queue
Definition FLASH_Queue_exec.c:2739
Definition FLA_type_defs.h:184

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, i, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

Referenced by FLASH_Queue_end().

◆ FLASH_Queue_exec_gpu()

FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task * t,
void * arg
)
1294{
1295 void** input_arg;
1296 void** output_arg;
1297
1298 if ( t == NULL )
1299 return TRUE;
1300
1301 // If not using the GPU, then execute on CPU.
1303 {
1305
1306 return TRUE;
1307 }
1308
1309 // Check if all the operands are ready and up to date.
1310 if ( !FLASH_Queue_check_gpu( t, arg ) )
1311 {
1313 int queue = t->queue;
1314 t->hit = FALSE;
1315
1316#ifdef FLA_ENABLE_MULTITHREADING
1317 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
1318#endif
1319 // Reenqueue the task if the blocks are not all flushed.
1321
1322#ifdef FLA_ENABLE_MULTITHREADING
1323 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
1324#endif
1325
1326 return FALSE;
1327 }
1328
1329 // If GPU is enabled, but the task is not supported for GPU execution.
1330 if ( !t->enabled_gpu )
1331 {
1332 int i, j, k;
1333 int thread = t->thread;
1334 int n_input_args = t->n_input_args;
1335 int n_output_args = t->n_output_args;
1338 FLA_Obj obj;
1339
1340 // Check the blocks on each GPU.
1341 for ( k = 0; k < n_threads; k++ )
1342 {
1343 // Check the input and output arguments on the GPUs.
1344 for ( i = 0; i < n_input_args + n_output_args; i++ )
1345 {
1346 // Check for duplicate blocks.
1347 duplicate = FALSE;
1348
1349 // Find the correct input or output argument.
1350 if ( i < n_input_args )
1351 {
1352 obj = t->input_arg[i];
1353
1354 for ( j = 0; j < n_output_args && !duplicate; j++ )
1355 {
1356 if ( obj.base == t->output_arg[j].base )
1357 duplicate = TRUE;
1358 }
1359
1360 for ( j = 0; j < i && !duplicate; j++ )
1361 {
1362 if ( obj.base == t->input_arg[j].base )
1363 duplicate = TRUE;
1364 }
1365 }
1366 else
1367 {
1368 obj = t->output_arg[i - n_input_args];
1369
1370 for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1371 {
1372 if ( obj.base == t->output_arg[j].base )
1373 duplicate = TRUE;
1374 }
1375 }
1376
1377 // If the block has not been processed before.
1378 if ( !duplicate )
1379 {
1380 // Macroblock is used.
1381 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1382 {
1383 dim_t jj, kk;
1384 dim_t m = FLA_Obj_length( obj );
1385 dim_t n = FLA_Obj_width( obj );
1386 dim_t cs = FLA_Obj_col_stride( obj );
1387 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1388
1389 // Clear each block in macroblock.
1390 for ( jj = 0; jj < n; jj++ )
1391 {
1392 for ( kk = 0; kk < m; kk++ )
1393 {
1394 obj = *( buf + jj * cs + kk );
1395
1396 // Flush the block to main memory if it is on the GPU.
1397 if ( k == thread )
1399
1400 // Invalidate output block on all GPUs.
1401 if ( i >= n_input_args )
1403 }
1404 }
1405 }
1406 else
1407 {
1408 // Flush the block to main memory if it is on the GPU.
1409 if ( k == thread )
1411
1412 // Invalidate output block on all GPUs.
1413 if ( i >= n_input_args )
1415 }
1416 }
1417 }
1418 }
1419
1420 // Execute the task on CPU instead of GPU.
1422
1423 return TRUE;
1424 }
1425
1426 // Gather the pointers for the data on the GPU.
1427 input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
1428 output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );
1429
1430 // Bring all the blocks to GPU.
1431 FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );
1432
1433 // Execute the task on GPU.
1434 FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );
1435
1436 // Mark all the output blocks as dirty.
1438
1439 // Free memory.
1440 FLA_free( input_arg );
1441 FLA_free( output_arg );
1442
1443 return TRUE;
1444}
void FLASH_Queue_flush_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1893
void FLASH_Queue_update_gpu(FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
Definition FLASH_Queue_exec.c:1590
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1844
FLA_Bool FLASH_Queue_check_gpu(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:1447
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:626
void FLASH_Queue_mark_gpu(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:1787
void FLASH_Queue_exec_task_gpu(FLASH_Task *t, void **input_arg, void **output_arg)
Definition FLASH_Queue_gpu.c:225
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition FLASH_Queue.c:1141
FLA_Lock * run_lock
Definition FLASH_Queue_exec.c:62

References FLA_Obj_view::base, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), i, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_exec_parallel()

void FLASH_Queue_exec_parallel ( void * arg )
2049{
2050 int i;
2052 void* (*thread_entry_point)( void* );
2053
2054 // Allocate the thread structures array. Here, an array of FLASH_Thread
2055 // structures of length n_threads is allocated and the fields of each
2056 // structure set to appropriate values.
2057 FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );
2058
2059 // Initialize the thread structures array.
2060 for ( i = 0; i < n_threads; i++ )
2061 {
2062 // Save the thread's identifier.
2063 thread[i].id = i;
2064
2065 // Save the pointer to the necessary variables with the thread.
2066 thread[i].args = arg;
2067
2068 // The pthread object, if it was even compiled into the FLASH_Thread
2069 // structure, will be initialized by the pthread implementation when we
2070 // call pthread_create() and does not need to be touched at this time.
2071 }
2072
2073 // Determine which function to send threads to.
2075
2076#if FLA_MULTITHREADING_MODEL == FLA_OPENMP
2077
2078 // An OpenMP parallel for region spawns n_threads threads. Each thread
2079 // executes the work function with a different FLASH_Thread argument.
2080 // An implicit synchronization point exists at the end of the curly
2081 // brace scope.
2082 #pragma omp parallel for \
2083 private( i ) \
2084 shared( thread, n_threads, thread_entry_point ) \
2085 schedule( static, 1 ) \
2086 num_threads( n_threads )
2087 for ( i = 0; i < n_threads; ++i )
2088 {
2089 thread_entry_point( ( void* ) &thread[i] );
2090 }
2091
2092#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2093
2094 // Create each POSIX thread needed in addition to the main thread.
2095 for ( i = 1; i < n_threads; i++ )
2096 {
2097 int pthread_e_val;
2098
2099 // Create thread i with default attributes.
2100 pthread_e_val = pthread_create( &(thread[i].pthread_obj),
2101 NULL,
2103 ( void* ) &thread[i] );
2104
2105#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2108#endif
2109 }
2110
2111 // The main thread is assigned the role of thread 0. Here we manually
2112 // execute it as a worker thread.
2113 thread_entry_point( ( void* ) &thread[0] );
2114
2115 // Wait for non-main threads to finish.
2116 for ( i = 1; i < n_threads; i++ )
2117 {
2118 // These two variables are declared local to this for loop since this
2119 // is the only place they are needed, and since they would show up as
2120 // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
2121 // Strangely, the Intel compiler produces code that results in an
2122 // "unaligned access" runtime message if thread_status is declared as
2123 // an int. Declaring it as a long or void* appears to force the
2124 // compiler (not surprisingly) into aligning it to an 8-byte boundary.
2125 int pthread_e_val;
2126 void* thread_status;
2127
2128 // Wait for thread i to invoke its respective pthread_exit().
2129 // The return value passed to pthread_exit() is provided to us
2130 // via status, if one was given.
2131 pthread_e_val = pthread_join( thread[i].pthread_obj,
2132 ( void** ) &thread_status );
2133
2134#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2137#endif
2138 }
2139
2140#endif
2141
2142 FLA_free( thread );
2143
2144 return;
2145}
FLA_Error FLA_Check_pthread_create_result(int pthread_create_r_val)
Definition FLA_Check.c:750
FLA_Error FLA_Check_pthread_join_result(int pthread_join_r_val)
Definition FLA_Check.c:760
int FLA_Error
Definition FLA_type_defs.h:47
Definition FLA_type_defs.h:255
int id
Definition FLA_type_defs.h:257

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_get_num_threads(), i, and FLASH_Thread_s::id.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_exec_parallel_function()

void * FLASH_Queue_exec_parallel_function ( void * arg )
2988{
2990 int i = RCCE_ue();
2991 int queue = 0;
2992 int cache = 0;
2993 int n_tasks = FLASH_Queue_get_num_tasks();
2997 FLASH_Task* t = NULL;
2998
2999 // Do not let extraneous cores execute.
3000 if ( i < n_threads )
3001 condition = TRUE;
3002 else
3003 condition = FALSE;
3004
3005 // Loop until all the tasks have committed.
3006 while ( condition )
3007 {
3008 RCCE_acquire_lock( 0 );
3009
3010 // Obtain task to execute.
3011 t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
3012
3013 RCCE_release_lock( 0 );
3014
3015 // Dequeued a task from the waiting queue.
3016 available = ( t != NULL );
3017
3018 if ( available )
3019 {
3020 // Save the thread and cache that executes the task.
3021 t->thread = i;
3022 t->cache = cache;
3023
3024 // Execute the task.
3026
3027 // Update task dependencies.
3028 FLASH_Task_update_dependencies( t, ( void* ) args );
3029 }
3030
3031 RCCE_acquire_lock( 0 );
3032
3033 // Terminate loop.
3034 if ( args->pc[0] >= n_tasks )
3035 condition = FALSE;
3036
3037 RCCE_release_lock( 0 );
3038 }
3039
3040 return ( void* ) NULL;
3041}
int RCCE_acquire_lock(int)
FLASH_Task * FLASH_Task_update_dependencies(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:2316
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition FLASH_Queue_exec.c:678
int RCCE_ue(void)
int RCCE_release_lock(int)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), i, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

◆ FLASH_Queue_exec_simulation()

void FLASH_Queue_exec_simulation ( void * arg )
2595{
2597 int i, j;
2598 int queue;
2599 int cache;
2600 int n_stages = 0;
2601 int n_queues = args->n_queues;
2602 int n_tasks = FLASH_Queue_get_num_tasks();
2606 FLASH_Task* task;
2607 FLASH_Task* t;
2608 FLASH_Dep* d;
2609
2610 // An array to hold tasks to be executed during of simulation.
2611#ifdef FLA_ENABLE_WINDOWS_BUILD
2613#else
2615#endif
2616
2617 for ( i = 0; i < n_threads; i++ )
2618 {
2619 // Initialize all exec_array to NULL.
2620 exec_array[i] = NULL;
2621
2622 // Prefetch blocks into the cache before execution.
2623 if ( i % n_cores == 0 )
2625 }
2626
2627 // Loop until all the tasks have committed.
2628 while ( args->pc < n_tasks )
2629 {
2630 for ( i = 0; i < n_threads; i++ )
2631 {
2632 // Update waiting queue with ready tasks.
2633 t = exec_array[i];
2634
2635 if ( t != NULL )
2636 {
2637 // Check each dependent task.
2638 d = t->dep_arg_head;
2639
2640 for ( j = 0; j < t->n_dep_args; j++ )
2641 {
2642 task = d->task;
2643 task->n_ready--;
2644
2645 // Place newly ready tasks on waiting queue.
2646 if ( task->n_ready == 0 )
2647 {
2649 }
2650
2651 // Go to the next dep.
2652 d = d->next_dep;
2653 }
2654
2655 // Free the task.
2656 FLASH_Task_free( t );
2657 }
2658 }
2659
2660 n_stages++;
2661 if ( !verbose )
2662 printf( "%7d", n_stages );
2663
2664 // Move ready tasks from the waiting queue to execution queue.
2665 for ( i = 0; i < n_threads; i++ )
2666 {
2667 // Determine to which queue this thread belongs.
2668 queue = i / ( n_threads / n_queues );
2669
2670 // Determine to which cache this thread belongs.
2671 cache = i / n_cores;
2672
2673 // Dequeue a task.
2674 t = FLASH_Queue_wait_dequeue( queue, cache, arg );
2675
2676 // Save the task for execution.
2677 exec_array[i] = t;
2678
2679 if ( t != NULL )
2680 {
2681 // Save the thread and cache that executes the task.
2682 t->thread = i;
2683 t->cache = cache;
2684
2685 // Increment program counter.
2686 args->pc++;
2687 }
2688 }
2689
2690 // Execute independent tasks.
2691 for ( i = 0; i < n_threads; i++ )
2692 {
2693 t = exec_array[i];
2696
2697 if ( !verbose )
2698 printf( "%7s", ( t == NULL ? " " : t->name ) );
2699
2700 // Free the task if this is the last stage.
2701 if ( args->pc == n_tasks && t != NULL )
2702 FLASH_Task_free( t );
2703 }
2704
2705 if ( !verbose )
2706 printf( "\n" );
2707 }
2708
2709 if ( !verbose )
2710 printf( "\n" );
2711
2712#ifdef FLA_ENABLE_WINDOWS_BUILD
2714#endif
2715
2716 return;
2717}
void FLASH_Queue_prefetch(int cache, void *arg)
Definition FLASH_Queue_exec.c:1024
void FLASH_Queue_update_cache(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:847
int FLASH_Queue_get_cores_per_cache(void)
Definition FLASH_Queue.c:548
int FLASH_Verbose
Definition FLA_type_defs.h:113
Definition FLA_type_defs.h:245
FLASH_Task * task
Definition FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition FLA_type_defs.h:250
int n_queues
Definition FLASH_Queue_exec.c:77
int n_ready
Definition FLA_type_defs.h:186

References FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), i, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_exec_task()

void FLASH_Queue_exec_task ( FLASH_Task * t )
1147{
1148 // Define local function pointer types.
1149
1150 // LAPACK-level
1153 typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl);
1158 typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl);
1160 typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl);
1161 typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl);
1165 typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
1176
1177 // Level-3 BLAS
1187
1188 // Level-2 BLAS
1191
1192 // Level-1 BLAS
1195 typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl);
1197 typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl);
1200
1201 // Base
1202 typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl);
1203 typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl);
1204
1205 // Only execute task if it is not NULL.
1206 if ( t == NULL )
1207 return;
1208
1209 // Now "switch" between the various possible task functions.
1210
1211 // FLA_LU_piv_macro
1212 if ( t->func == (void *) FLA_LU_piv_macro_task )
1213 {
1214 flash_lu_piv_macro_p func;
1215 func = (flash_lu_piv_macro_p) t->func;
1216
1217 func( t->output_arg[0],
1218 t->output_arg[1],
1219 ( fla_lu_t* ) t->cntl );
1220 }
1221 // FLA_Apply_pivots_macro
1222 else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
1223 {
1224 flash_apply_pivots_macro_p func;
1225 func = (flash_apply_pivots_macro_p) t->func;
1226
1227 func( ( FLA_Side ) t->int_arg[0],
1228 ( FLA_Trans ) t->int_arg[1],
1229 t->input_arg[0],
1230 t->output_arg[0],
1231 ( fla_appiv_t* ) t->cntl );
1232 }
1233 // FLA_LU_piv
1234 else if ( t->func == (void *) FLA_LU_piv_task )
1235 {
1236 flash_lu_piv_p func;
1237 func = (flash_lu_piv_p) t->func;
1238
1239 func( t->output_arg[0],
1240 t->fla_arg[0],
1241 ( fla_lu_t* ) t->cntl );
1242 }
1243 // FLA_LU_piv_copy
1244 else if ( t->func == (void *) FLA_LU_piv_copy_task )
1245 {
1246 flash_lu_piv_copy_p func;
1247 func = (flash_lu_piv_copy_p) t->func;
1248
1249 func( t->output_arg[0],
1250 t->fla_arg[0],
1251 t->output_arg[1],
1252 ( fla_lu_t* ) t->cntl );
1253 }
1254 // FLA_Trsm_piv
1255 else if ( t->func == (void *) FLA_Trsm_piv_task )
1256 {
1257 flash_trsm_piv_p func;
1258 func = (flash_trsm_piv_p) t->func;
1259
1260 func( t->input_arg[0],
1261 t->output_arg[0],
1262 t->fla_arg[0],
1263 ( fla_trsm_t* ) t->cntl );
1264 }
1265 // FLA_SA_LU
1266 else if ( t->func == (void *) FLA_SA_LU_task )
1267 {
1268 flash_sa_lu_p func;
1269 func = (flash_sa_lu_p) t->func;
1270
1271 func( t->output_arg[1],
1272 t->output_arg[0],
1273 t->fla_arg[0],
1274 t->fla_arg[1],
1275 t->int_arg[0],
1276 ( fla_lu_t* ) t->cntl );
1277 }
1278 // FLA_SA_FS
1279 else if ( t->func == (void *) FLA_SA_FS_task )
1280 {
1281 flash_sa_fs_p func;
1282 func = (flash_sa_fs_p) t->func;
1283
1284 func( t->fla_arg[0],
1285 t->input_arg[0],
1286 t->fla_arg[1],
1287 t->output_arg[1],
1288 t->output_arg[0],
1289 t->int_arg[0],
1290 ( fla_gemm_t* ) t->cntl );
1291 }
1292 // FLA_LU_nopiv
1293 else if ( t->func == (void *) FLA_LU_nopiv_task )
1294 {
1295 flash_lu_nopiv_p func;
1296 func = (flash_lu_nopiv_p) t->func;
1297
1298 func( t->output_arg[0],
1299 ( fla_lu_t* ) t->cntl );
1300 }
1301 // FLA_Trinv
1302 else if ( t->func == (void *) FLA_Trinv_task )
1303 {
1304 flash_trinv_p func;
1305 func = (flash_trinv_p) t->func;
1306
1307 func( ( FLA_Uplo ) t->int_arg[0],
1308 ( FLA_Diag ) t->int_arg[1],
1309 t->output_arg[0],
1310 ( fla_trinv_t* ) t->cntl );
1311 }
1312 // FLA_Ttmm
1313 else if ( t->func == (void *) FLA_Ttmm_task )
1314 {
1315 flash_ttmm_p func;
1316 func = (flash_ttmm_p) t->func;
1317
1318 func( ( FLA_Uplo ) t->int_arg[0],
1319 t->output_arg[0],
1320 ( fla_ttmm_t* ) t->cntl );
1321 }
1322 // FLA_Chol
1323 else if ( t->func == (void *) FLA_Chol_task )
1324 {
1325 flash_chol_p func;
1326 func = (flash_chol_p) t->func;
1327
1328 func( ( FLA_Uplo ) t->int_arg[0],
1329 t->output_arg[0],
1330 ( fla_chol_t* ) t->cntl );
1331 }
1332 // FLA_Sylv
1333 else if ( t->func == (void *) FLA_Sylv_task )
1334 {
1335 flash_sylv_p func;
1336 func = (flash_sylv_p) t->func;
1337
1338 func( ( FLA_Trans ) t->int_arg[0],
1339 ( FLA_Trans ) t->int_arg[1],
1340 t->fla_arg[0],
1341 t->input_arg[0],
1342 t->input_arg[1],
1343 t->output_arg[0],
1344 t->fla_arg[1],
1345 ( fla_sylv_t* ) t->cntl );
1346 }
1347 // FLA_Lyap
1348 else if ( t->func == (void *) FLA_Lyap_task )
1349 {
1350 flash_lyap_p func;
1351 func = (flash_lyap_p) t->func;
1352
1353 func( ( FLA_Trans ) t->int_arg[0],
1354 t->fla_arg[0],
1355 t->input_arg[0],
1356 t->output_arg[0],
1357 t->fla_arg[1],
1358 ( fla_lyap_t* ) t->cntl );
1359 }
1360 // FLA_QR_UT_macro
1361 else if ( t->func == (void *) FLA_QR_UT_macro_task )
1362 {
1363 flash_qrut_macro_p func;
1364 func = (flash_qrut_macro_p) t->func;
1365
1366 func( t->output_arg[0],
1367 t->output_arg[1],
1368 ( fla_qrut_t* ) t->cntl );
1369 }
1370 // FLA_QR_UT
1371 else if ( t->func == (void *) FLA_QR_UT_task )
1372 {
1373 flash_qrut_p func;
1374 func = (flash_qrut_p) t->func;
1375
1376 func( t->output_arg[0],
1377 t->fla_arg[0],
1378 ( fla_qrut_t* ) t->cntl );
1379 }
1380 // FLA_QR_UT_copy
1381 else if ( t->func == (void *) FLA_QR_UT_copy_task )
1382 {
1383 flash_qrutc_p func;
1384 func = (flash_qrutc_p) t->func;
1385
1386 func( t->output_arg[0],
1387 t->fla_arg[0],
1388 t->output_arg[1],
1389 ( fla_qrut_t* ) t->cntl );
1390 }
1391 // FLA_QR2_UT
1392 else if ( t->func == (void *) FLA_QR2_UT_task )
1393 {
1394 flash_qr2ut_p func;
1395 func = (flash_qr2ut_p) t->func;
1396
1397 func( t->output_arg[1],
1398 t->output_arg[0],
1399 t->fla_arg[0],
1400 ( fla_qr2ut_t* ) t->cntl );
1401 }
1402 // FLA_LQ_UT_macro
1403 else if ( t->func == (void *) FLA_LQ_UT_macro_task )
1404 {
1405 flash_lqut_macro_p func;
1406 func = (flash_lqut_macro_p) t->func;
1407
1408 func( t->output_arg[0],
1409 t->output_arg[1],
1410 ( fla_lqut_t* ) t->cntl );
1411 }
1412 // FLA_CAQR2_UT
1413 else if ( t->func == (void *) FLA_CAQR2_UT_task )
1414 {
1415 flash_caqr2ut_p func;
1416 func = (flash_caqr2ut_p) t->func;
1417
1418 func( t->output_arg[1],
1419 t->output_arg[0],
1420 t->fla_arg[0],
1421 ( fla_caqr2ut_t* ) t->cntl );
1422 }
1423 // FLA_UDdate_UT
1424 else if ( t->func == (void *) FLA_UDdate_UT_task )
1425 {
1426 flash_uddateut_p func;
1427 func = (flash_uddateut_p) t->func;
1428
1429 func( t->output_arg[0],
1430 t->output_arg[1],
1431 t->output_arg[2],
1432 t->output_arg[3],
1433 ( fla_uddateut_t* ) t->cntl );
1434 }
1435 // FLA_Apply_Q_UT
1436 else if ( t->func == (void *) FLA_Apply_Q_UT_task )
1437 {
1438 flash_apqut_p func;
1439 func = (flash_apqut_p) t->func;
1440
1441 func( ( FLA_Side ) t->int_arg[0],
1442 ( FLA_Trans ) t->int_arg[1],
1443 ( FLA_Direct ) t->int_arg[2],
1444 ( FLA_Store ) t->int_arg[3],
1445 t->input_arg[0],
1446 t->fla_arg[0],
1447 t->output_arg[1],
1448 t->output_arg[0],
1449 ( fla_apqut_t* ) t->cntl );
1450 }
1451 // FLA_Apply_Q2_UT
1452 else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
1453 {
1454 flash_apq2ut_p func;
1455 func = (flash_apq2ut_p) t->func;
1456
1457 func( ( FLA_Side ) t->int_arg[0],
1458 ( FLA_Trans ) t->int_arg[1],
1459 ( FLA_Direct ) t->int_arg[2],
1460 ( FLA_Store ) t->int_arg[3],
1461 t->input_arg[0],
1462 t->fla_arg[0],
1463 t->output_arg[2],
1464 t->output_arg[1],
1465 t->output_arg[0],
1466 ( fla_apq2ut_t* ) t->cntl );
1467 }
1468 // FLA_Apply_CAQ2_UT
1469 else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
1470 {
1471 flash_apcaq2ut_p func;
1472 func = (flash_apcaq2ut_p) t->func;
1473
1474 func( ( FLA_Side ) t->int_arg[0],
1475 ( FLA_Trans ) t->int_arg[1],
1476 ( FLA_Direct ) t->int_arg[2],
1477 ( FLA_Store ) t->int_arg[3],
1478 t->input_arg[0],
1479 t->fla_arg[0],
1480 t->output_arg[2],
1481 t->output_arg[1],
1482 t->output_arg[0],
1483 ( fla_apcaq2ut_t* ) t->cntl );
1484 }
1485 // FLA_Apply_QUD_UT
1486 else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
1487 {
1488 flash_apqudut_p func;
1489 func = (flash_apqudut_p) t->func;
1490
1491 func( ( FLA_Side ) t->int_arg[0],
1492 ( FLA_Trans ) t->int_arg[1],
1493 ( FLA_Direct ) t->int_arg[2],
1494 ( FLA_Store ) t->int_arg[3],
1495 t->input_arg[0],
1496 t->output_arg[0],
1497 t->output_arg[1],
1498 t->input_arg[1],
1499 t->output_arg[2],
1500 t->input_arg[2],
1501 t->output_arg[3],
1502 ( fla_apqudut_t* ) t->cntl );
1503 }
1504 // FLA_Eig_gest
1505 else if ( t->func == (void *) FLA_Eig_gest_task )
1506 {
1507 flash_eig_gest_p func;
1508 func = (flash_eig_gest_p) t->func;
1509
1510 func( ( FLA_Inv ) t->int_arg[0],
1511 ( FLA_Uplo ) t->int_arg[1],
1512 t->output_arg[1],
1513 t->output_arg[0],
1514 t->input_arg[0],
1515 ( fla_eig_gest_t* ) t->cntl );
1516 }
1517 // FLA_Gemm
1518 else if ( t->func == (void *) FLA_Gemm_task )
1519 {
1520 flash_gemm_p func;
1521 func = (flash_gemm_p) t->func;
1522
1523 func( ( FLA_Trans ) t->int_arg[0],
1524 ( FLA_Trans ) t->int_arg[1],
1525 t->fla_arg[0],
1526 t->input_arg[0],
1527 t->input_arg[1],
1528 t->fla_arg[1],
1529 t->output_arg[0],
1530 ( fla_gemm_t* ) t->cntl );
1531 }
1532 // FLA_Hemm
1533 else if ( t->func == (void *) FLA_Hemm_task )
1534 {
1535 flash_hemm_p func;
1536 func = (flash_hemm_p) t->func;
1537
1538 func( ( FLA_Side ) t->int_arg[0],
1539 ( FLA_Uplo ) t->int_arg[1],
1540 t->fla_arg[0],
1541 t->input_arg[0],
1542 t->input_arg[1],
1543 t->fla_arg[1],
1544 t->output_arg[0],
1545 ( fla_hemm_t* ) t->cntl );
1546 }
1547 // FLA_Herk
1548 else if ( t->func == (void *) FLA_Herk_task )
1549 {
1550 flash_herk_p func;
1551 func = (flash_herk_p) t->func;
1552
1553 func( ( FLA_Uplo ) t->int_arg[0],
1554 ( FLA_Trans ) t->int_arg[1],
1555 t->fla_arg[0],
1556 t->input_arg[0],
1557 t->fla_arg[1],
1558 t->output_arg[0],
1559 ( fla_herk_t* ) t->cntl );
1560 }
1561 // FLA_Her2k
1562 else if ( t->func == (void *) FLA_Her2k_task )
1563 {
1564 flash_her2k_p func;
1565 func = (flash_her2k_p) t->func;
1566
1567 func( ( FLA_Uplo ) t->int_arg[0],
1568 ( FLA_Trans ) t->int_arg[1],
1569 t->fla_arg[0],
1570 t->input_arg[0],
1571 t->input_arg[1],
1572 t->fla_arg[1],
1573 t->output_arg[0],
1574 ( fla_her2k_t* ) t->cntl );
1575 }
1576 // FLA_Symm
1577 else if ( t->func == (void *) FLA_Symm_task )
1578 {
1579 flash_symm_p func;
1580 func = (flash_symm_p) t->func;
1581
1582 func( ( FLA_Side ) t->int_arg[0],
1583 ( FLA_Uplo ) t->int_arg[1],
1584 t->fla_arg[0],
1585 t->input_arg[0],
1586 t->input_arg[1],
1587 t->fla_arg[1],
1588 t->output_arg[0],
1589 ( fla_symm_t* ) t->cntl );
1590 }
1591 // FLA_Syrk
1592 else if ( t->func == (void *) FLA_Syrk_task )
1593 {
1594 flash_syrk_p func;
1595 func = (flash_syrk_p) t->func;
1596
1597 func( ( FLA_Uplo ) t->int_arg[0],
1598 ( FLA_Trans ) t->int_arg[1],
1599 t->fla_arg[0],
1600 t->input_arg[0],
1601 t->fla_arg[1],
1602 t->output_arg[0],
1603 ( fla_syrk_t* ) t->cntl );
1604 }
1605 // FLA_Syr2k
1606 else if ( t->func == (void *) FLA_Syr2k_task )
1607 {
1608 flash_syr2k_p func;
1609 func = (flash_syr2k_p) t->func;
1610
1611 func( ( FLA_Uplo ) t->int_arg[0],
1612 ( FLA_Trans ) t->int_arg[1],
1613 t->fla_arg[0],
1614 t->input_arg[0],
1615 t->input_arg[1],
1616 t->fla_arg[1],
1617 t->output_arg[0],
1618 ( fla_syr2k_t* ) t->cntl );
1619 }
1620 // FLA_Trmm
1621 else if ( t->func == (void *) FLA_Trmm_task )
1622 {
1623 flash_trmm_p func;
1624 func = (flash_trmm_p) t->func;
1625
1626 func( ( FLA_Side ) t->int_arg[0],
1627 ( FLA_Uplo ) t->int_arg[1],
1628 ( FLA_Trans ) t->int_arg[2],
1629 ( FLA_Diag ) t->int_arg[3],
1630 t->fla_arg[0],
1631 t->input_arg[0],
1632 t->output_arg[0],
1633 ( fla_trmm_t* ) t->cntl );
1634 }
1635 // FLA_Trsm
1636 else if ( t->func == (void *) FLA_Trsm_task )
1637 {
1638 flash_trsm_p func;
1639 func = (flash_trsm_p) t->func;
1640
1641 func( ( FLA_Side ) t->int_arg[0],
1642 ( FLA_Uplo ) t->int_arg[1],
1643 ( FLA_Trans ) t->int_arg[2],
1644 ( FLA_Diag ) t->int_arg[3],
1645 t->fla_arg[0],
1646 t->input_arg[0],
1647 t->output_arg[0],
1648 ( fla_trsm_t* ) t->cntl );
1649 }
1650 // FLA_Gemv
1651 else if ( t->func == (void *) FLA_Gemv_task )
1652 {
1653 flash_gemv_p func;
1654 func = (flash_gemv_p) t->func;
1655
1656 func( ( FLA_Trans ) t->int_arg[0],
1657 t->fla_arg[0],
1658 t->input_arg[0],
1659 t->input_arg[1],
1660 t->fla_arg[1],
1661 t->output_arg[0],
1662 ( fla_gemv_t* ) t->cntl );
1663 }
1664 // FLA_Trsv
1665 else if ( t->func == (void *) FLA_Trsv_task )
1666 {
1667 flash_trsv_p func;
1668 func = (flash_trsv_p) t->func;
1669
1670 func( ( FLA_Uplo ) t->int_arg[0],
1671 ( FLA_Trans ) t->int_arg[1],
1672 ( FLA_Diag ) t->int_arg[2],
1673 t->input_arg[0],
1674 t->output_arg[0],
1675 ( fla_trsv_t* ) t->cntl );
1676 }
1677 // FLA_Axpy
1678 else if ( t->func == (void *) FLA_Axpy_task )
1679 {
1680 flash_axpy_p func;
1681 func = (flash_axpy_p) t->func;
1682
1683 func( t->fla_arg[0],
1684 t->input_arg[0],
1685 t->output_arg[0],
1686 ( fla_axpy_t* ) t->cntl );
1687 }
1688 // FLA_Axpyt
1689 else if ( t->func == (void *) FLA_Axpyt_task )
1690 {
1691 flash_axpyt_p func;
1692 func = (flash_axpyt_p) t->func;
1693
1694 func( ( FLA_Trans ) t->int_arg[0],
1695 t->fla_arg[0],
1696 t->input_arg[0],
1697 t->output_arg[0],
1698 ( fla_axpyt_t* ) t->cntl );
1699 }
1700 // FLA_Copy
1701 else if ( t->func == (void *) FLA_Copy_task )
1702 {
1703 flash_copy_p func;
1704 func = (flash_copy_p) t->func;
1705
1706 func( t->input_arg[0],
1707 t->output_arg[0],
1708 ( fla_copy_t* ) t->cntl );
1709 }
1710 // FLA_Copyt
1711 else if ( t->func == (void *) FLA_Copyt_task )
1712 {
1713 flash_copyt_p func;
1714 func = (flash_copyt_p) t->func;
1715
1716 func( ( FLA_Trans ) t->int_arg[0],
1717 t->input_arg[0],
1718 t->output_arg[0],
1719 ( fla_copyt_t* ) t->cntl );
1720 }
1721 // FLA_Copyr
1722 else if ( t->func == (void *) FLA_Copyr_task )
1723 {
1724 flash_copyr_p func;
1725 func = (flash_copyr_p) t->func;
1726
1727 func( ( FLA_Uplo ) t->int_arg[0],
1728 t->input_arg[0],
1729 t->output_arg[0],
1730 ( fla_copyr_t* ) t->cntl );
1731 }
1732 // FLA_Scal
1733 else if ( t->func == (void *) FLA_Scal_task )
1734 {
1735 flash_scal_p func;
1736 func = (flash_scal_p) t->func;
1737
1738 func( t->fla_arg[0],
1739 t->output_arg[0],
1740 ( fla_scal_t* ) t->cntl );
1741 }
1742 // FLA_Scalr
1743 else if ( t->func == (void *) FLA_Scalr_task )
1744 {
1745 flash_scalr_p func;
1746 func = (flash_scalr_p) t->func;
1747
1748 func( ( FLA_Uplo ) t->int_arg[0],
1749 t->fla_arg[0],
1750 t->output_arg[0],
1751 ( fla_scalr_t* ) t->cntl );
1752 }
1753 // FLA_Obj_create_buffer
1754 else if ( t->func == (void *) FLA_Obj_create_buffer_task )
1755 {
1756 flash_obj_create_buffer_p func;
1757 func = (flash_obj_create_buffer_p) t->func;
1758
1759 func( ( dim_t ) t->int_arg[0],
1760 ( dim_t ) t->int_arg[1],
1761 t->output_arg[0],
1762 t->cntl );
1763 }
1764 // FLA_Obj_free_buffer
1765 else if ( t->func == (void *) FLA_Obj_free_buffer_task )
1766 {
1767 flash_obj_free_buffer_p func;
1768 func = (flash_obj_free_buffer_p) t->func;
1769
1770 func( t->output_arg[0],
1771 t->cntl );
1772 }
1773 else
1774 {
1776 }
1777
1778 return;
1779}
FLA_Error FLA_Scal_task(FLA_Obj alpha, FLA_Obj A, fla_scal_t *cntl)
Definition FLA_Scal_task.c:13
FLA_Error FLA_Scalr_task(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t *cntl)
Definition FLA_Scalr_task.c:13
FLA_Error FLA_Copy_task(FLA_Obj A, FLA_Obj B, fla_copy_t *cntl)
Definition FLA_Copy_task.c:13
FLA_Error FLA_Copyt_task(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t *cntl)
Definition FLA_Copyt_task.c:13
FLA_Error FLA_Axpy_task(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t *cntl)
Definition FLA_Axpy_task.c:13
FLA_Error FLA_Copyr_task(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t *cntl)
Definition FLA_Copyr_task.c:13
FLA_Error FLA_Axpyt_task(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t *cntl)
Definition FLA_Axpyt_task.c:13
FLA_Error FLA_Gemv_task(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t *cntl)
Definition FLA_Gemv_task.c:13
FLA_Error FLA_Trsv_task(FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t *cntl)
Definition FLA_Trsv_task.c:13
FLA_Error FLA_Syrk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t *cntl)
Definition FLA_Syrk_task.c:13
FLA_Error FLA_Herk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t *cntl)
Definition FLA_Herk_task.c:13
FLA_Error FLA_Trmm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t *cntl)
Definition FLA_Trmm_task.c:13
FLA_Error FLA_Hemm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t *cntl)
Definition FLA_Hemm_task.c:13
FLA_Error FLA_Her2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t *cntl)
Definition FLA_Her2k_task.c:13
FLA_Error FLA_Syr2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t *cntl)
Definition FLA_Syr2k_task.c:13
FLA_Error FLA_Symm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t *cntl)
Definition FLA_Symm_task.c:13
FLA_Error FLA_Trsm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t *cntl)
Definition FLA_Trsm_task.c:13
FLA_Error FLA_Gemm_task(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t *cntl)
Definition FLA_Gemm_task.c:13
FLA_Error FLA_LU_piv_copy_task(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t *cntl)
Definition FLA_LU_piv_copy_task.c:13
FLA_Error FLA_LQ_UT_macro_task(FLA_Obj A, FLA_Obj T, fla_lqut_t *cntl)
Definition FLA_LQ_UT_macro_task.c:15
FLA_Error FLA_QR_UT_task(FLA_Obj A, FLA_Obj T, fla_qrut_t *cntl)
Definition FLA_QR_UT_task.c:15
FLA_Error FLA_QR_UT_copy_task(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t *cntl)
Definition FLA_QR_UT_copy_task.c:15
FLA_Error FLA_Lyap_task(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t *cntl)
Definition FLA_Lyap_task.c:15
FLA_Error FLA_Chol_task(FLA_Uplo uplo, FLA_Obj A, fla_chol_t *cntl)
Definition FLA_Chol_task.c:15
FLA_Error FLA_Trinv_task(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t *cntl)
Definition FLA_Trinv_task.c:15
FLA_Error FLA_Trsm_piv_task(FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl)
Definition FLA_Trsm_piv_task.c:13
FLA_Error FLA_Apply_Q_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t *cntl)
Definition FLA_Apply_Q_UT_task.c:15
FLA_Error FLA_Apply_Q2_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t *cntl)
Definition FLA_Apply_Q2_UT_task.c:15
FLA_Error FLA_LU_piv_macro_task(FLA_Obj A, FLA_Obj p, fla_lu_t *cntl)
Definition FLA_LU_piv_macro_task.c:13
FLA_Error FLA_Ttmm_task(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t *cntl)
Definition FLA_Ttmm_task.c:15
FLA_Error FLA_SA_FS_task(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t *cntl)
Definition FLA_SA_FS_task.c:13
FLA_Error FLA_QR_UT_macro_task(FLA_Obj A, FLA_Obj T, fla_qrut_t *cntl)
Definition FLA_QR_UT_macro_task.c:15
FLA_Error FLA_QR2_UT_task(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t *cntl)
Definition FLA_QR2_UT_task.c:15
FLA_Error FLA_Sylv_task(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t *cntl)
Definition FLA_Sylv_task.c:15
FLA_Error FLA_SA_LU_task(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
Definition FLA_SA_LU_task.c:13
FLA_Error FLA_LU_nopiv_task(FLA_Obj A, fla_lu_t *cntl)
Definition FLA_LU_nopiv_task.c:15
FLA_Error FLA_Apply_pivots_macro_task(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t *cntl)
Definition FLA_Apply_pivots_macro_task.c:15
FLA_Error FLA_CAQR2_UT_task(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t *cntl)
Definition FLA_CAQR2_UT_task.c:15
FLA_Error FLA_UDdate_UT_task(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t *cntl)
Definition FLA_UDdate_UT_task.c:15
FLA_Error FLA_LU_piv_task(FLA_Obj A, FLA_Obj p, fla_lu_t *cntl)
Definition FLA_LU_piv_task.c:15
FLA_Error FLA_Apply_CAQ2_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t *cntl)
Definition FLA_Apply_CAQ2_UT_task.c:15
FLA_Error FLA_Apply_QUD_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t *cntl)
Definition FLA_Apply_QUD_UT_task.c:15
FLA_Error FLA_Eig_gest_task(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t *cntl)
Definition FLA_Eig_gest_task.c:16
FLA_Error FLA_Obj_create_buffer_task(dim_t rs, dim_t cs, FLA_Obj obj, void *cntl)
Definition FLA_Obj_create_buffer_task.c:13
FLA_Error FLA_Obj_free_buffer_task(FLA_Obj obj, void *cntl)
Definition FLA_Obj_free_buffer_task.c:13
int FLA_Side
Definition FLA_type_defs.h:51
int FLA_Inv
Definition FLA_type_defs.h:63
int FLA_Trans
Definition FLA_type_defs.h:53
int FLA_Store
Definition FLA_type_defs.h:59
int FLA_Uplo
Definition FLA_type_defs.h:52
int FLA_Diag
Definition FLA_type_defs.h:55
int FLA_Direct
Definition FLA_type_defs.h:58
Definition FLA_Cntl_lapack.h:264
Definition FLA_Cntl_lapack.h:43
Definition FLA_Cntl_lapack.h:228
Definition FLA_Cntl_lapack.h:318
Definition FLA_Cntl_lapack.h:211
Definition FLA_Cntl_blas1.h:17
Definition FLA_Cntl_blas1.h:27
Definition FLA_Cntl_lapack.h:106
Definition FLA_Cntl_lapack.h:17
Definition FLA_Cntl_blas1.h:37
Definition FLA_Cntl_blas1.h:57
Definition FLA_Cntl_blas1.h:47
Definition FLA_Cntl_lapack.h:356
Definition FLA_Cntl_blas3.h:17
Definition FLA_Cntl_blas2.h:17
Definition FLA_Cntl_blas3.h:28
Definition FLA_Cntl_blas3.h:53
Definition FLA_Cntl_blas3.h:41
Definition FLA_Cntl_lapack.h:96
Definition FLA_Cntl_lapack.h:53
Definition FLA_Cntl_lapack.h:183
Definition FLA_Cntl_lapack.h:81
Definition FLA_Cntl_lapack.h:70
Definition FLA_Cntl_blas1.h:68
Definition FLA_Cntl_blas1.h:78
Definition FLA_Cntl_lapack.h:163
Definition FLA_Cntl_blas3.h:66
Definition FLA_Cntl_blas3.h:91
Definition FLA_Cntl_blas3.h:79
Definition FLA_Cntl_lapack.h:149
Definition FLA_Cntl_blas3.h:104
Definition FLA_Cntl_blas3.h:116
Definition FLA_Cntl_blas2.h:27
Definition FLA_Cntl_lapack.h:30
Definition FLA_Cntl_lapack.h:307

References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_finalize()

void FLASH_Queue_finalize ( void  )
268{
269 // Exit early if we're not already initialized.
270 if ( flash_queue_initialized == FALSE )
271 return;
272
273 // Clear the initialized flag.
274 flash_queue_initialized = FALSE;
275
276#ifdef FLA_ENABLE_GPU
277 FLASH_Queue_finalize_gpu();
278#endif
279
280 return;
281}
void FLASH_Queue_finalize_gpu(void)
Definition FLASH_Queue_gpu.c:36

References FLASH_Queue_finalize_gpu().

Referenced by FLA_Finalize().

◆ FLASH_Queue_flush_block_gpu()

void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg 
)
1899{
1901 int k;
1905
1906#ifdef FLA_ENABLE_MULTITHREADING
1907 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1908#endif
1909
1910 // Locate the position of the block on the GPU.
1911 for ( k = 0; k < gpu_n_blocks; k++ )
1912 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1913 break;
1914
1915 // The block is owned by the GPU.
1916 if ( k < gpu_n_blocks )
1917 {
1918 // Save the block that will be flushed.
1919 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1920
1921 // If the block is dirty, then flush it.
1922 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1923 transfer = TRUE;
1924 }
1925
1926#ifdef FLA_ENABLE_MULTITHREADING
1927 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1928#endif
1929
1930 // Exit early if a flush is not required.
1931 if ( !transfer )
1932 return;
1933
1934 // Flush the block outside the critical section.
1935 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1936
1937#ifdef FLA_ENABLE_MULTITHREADING
1938 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1939#endif
1940
1941 // Locate the position of the block on the GPU.
1942 for ( k = 0; k < gpu_n_blocks; k++ )
1943 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1944 break;
1945
1946 if ( k < gpu_n_blocks )
1947 {
1948 // Update the bits for the flushed block.
1949 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1950 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1951 }
1952
1953#ifdef FLA_ENABLE_MULTITHREADING
1954 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1955#endif
1956
1957 return;
1958}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_flush_gpu()

void FLASH_Queue_flush_gpu ( int  thread,
void * arg 
)
1967{
1969 int i, k;
1971 int n_transfer = 0;
1973
1974 // Exit if not using GPU.
1975 if ( !FLASH_Queue_get_enabled_gpu() )
1976 return;
1977
1978#ifdef FLA_ENABLE_MULTITHREADING
1979 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1980#endif
1981
1982 for ( k = 0; k < gpu_n_blocks; k++ )
1983 {
1984 // Save the block that might be flushed.
1985 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1986
1987 // Flush the block if it is dirty and requested.
1988 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
1989 {
1990 // Save the block for data transfer outside the critical section.
1991 args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
1992 n_transfer++;
1993 }
1994 }
1995
1996#ifdef FLA_ENABLE_MULTITHREADING
1997 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1998#endif
1999
2000 // Exit early if a flush is not required.
2001 if ( n_transfer == 0 )
2002 return;
2003
2004 // Flush the block outside the critical section.
2005 for ( i = 0; i < n_transfer; i++ )
2006 {
2007 gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
2008 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
2009 }
2010
2011#ifdef FLA_ENABLE_MULTITHREADING
2012 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
2013#endif
2014
2015 // Update the bits for each block that is flushed.
2016 for ( i = 0; i < n_transfer; i++ )
2017 {
2018 // Locate the position of the block on the GPU.
2019 for ( k = 0; k < gpu_n_blocks; k++ )
2020 if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
2021 args->gpu[thread * gpu_n_blocks + k].obj.base )
2022 break;
2023
2024 if ( k < gpu_n_blocks )
2025 {
2026 // The block is now clean.
2027 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
2028 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
2029 }
2030 }
2031
2032#ifdef FLA_ENABLE_MULTITHREADING
2033 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
2034#endif
2035
2036 return;
2037}
FLA_Obj_gpu * gpu_log
Definition FLASH_Queue_exec.c:110

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_get_block_size()

dim_t FLASH_Queue_get_block_size ( void  )
482{
483 return flash_queue_block_size;
484}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_get_cache_line_size()

dim_t FLASH_Queue_get_cache_line_size ( void  )
530{
531 return flash_queue_cache_line_size;
532}

Referenced by FLASH_Queue_prefetch_block().

◆ FLASH_Queue_get_cache_size()

dim_t FLASH_Queue_get_cache_size ( void  )
506{
507 return flash_queue_cache_size;
508}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_get_caching()

FLA_Bool FLASH_Queue_get_caching ( void  )

◆ FLASH_Queue_get_cores_per_cache()

int FLASH_Queue_get_cores_per_cache ( void  )
554{
555 return flash_queue_cores_per_cache;
556}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_get_cores_per_queue()

int FLASH_Queue_get_cores_per_queue ( void  )
578{
579 return flash_queue_cores_per_queue;
580}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_get_data_affinity()

FLASH_Data_aff FLASH_Queue_get_data_affinity ( void  )
410{
411 return flash_queue_data_affinity;
412}

Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

◆ FLASH_Queue_get_enabled()

FLA_Bool FLASH_Queue_get_enabled ( void  )

◆ FLASH_Queue_get_head_task()

FLASH_Task * FLASH_Queue_get_head_task ( void  )
609{
610 return _tq.head;
611}
FLASH_Queue _tq
Definition FLASH_Queue.c:27
FLASH_Task * head
Definition FLA_type_defs.h:179

References _tq, and FLASH_Queue_s::head.

Referenced by FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

◆ FLASH_Queue_get_num_tasks()

unsigned int FLASH_Queue_get_num_tasks ( void  )

◆ FLASH_Queue_get_num_threads()

unsigned int FLASH_Queue_get_num_threads ( void  )

◆ FLASH_Queue_get_parallel_time()

double FLASH_Queue_get_parallel_time ( void  )
436{
437 // Only return time if out of parallel region.
438 if ( flash_queue_stack == 0 )
439 return flash_queue_parallel_time;
440
441 return 0.0;
442}

◆ FLASH_Queue_get_sorting()

FLA_Bool FLASH_Queue_get_sorting ( void  )
338{
339 return flash_queue_sorting;
340}

Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().

◆ FLASH_Queue_get_tail_task()

FLASH_Task * FLASH_Queue_get_tail_task ( void  )
620{
621 return _tq.tail;
622}
FLASH_Task * tail
Definition FLA_type_defs.h:180

References _tq, and FLASH_Queue_s::tail.

Referenced by FLASH_Queue_init_tasks().

◆ FLASH_Queue_get_total_time()

double FLASH_Queue_get_total_time ( void  )
421{
422 // Only return time if out of parallel region.
423 if ( flash_queue_stack == 0 )
424 return flash_queue_total_time;
425
426 return 0.0;
427}

◆ FLASH_Queue_get_verbose_output()

FLASH_Verbose FLASH_Queue_get_verbose_output ( void  )
314{
315 return flash_queue_verbose;
316}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().

◆ FLASH_Queue_get_work_stealing()

FLA_Bool FLASH_Queue_get_work_stealing ( void  )
386{
387 return flash_queue_work_stealing;
388}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().

◆ FLASH_Queue_init()

void FLASH_Queue_init ( void  )
243{
244 // Exit early if we're already initialized.
245 if ( flash_queue_initialized == TRUE )
246 return;
247
248 // Reset all the initial values.
250
251 // Set the initialized flag.
252 flash_queue_initialized = TRUE;
253
254#ifdef FLA_ENABLE_GPU
256#endif
257
258 return;
259}
void FLASH_Queue_reset(void)
Definition FLASH_Queue.c:583
void FLASH_Queue_init_gpu(void)
Definition FLASH_Queue_gpu.c:23

References FLASH_Queue_init_gpu(), and FLASH_Queue_reset().

Referenced by FLA_Init().

◆ FLASH_Queue_init_tasks()

void FLASH_Queue_init_tasks ( void * arg)
2836{
2838 int i, j;
2839 int n_tasks = FLASH_Queue_get_num_tasks();
2840 int n_ready = 0;
2841 int height;
2842 FLASH_Task* t;
2843 FLASH_Dep* d;
2844
2845 // Grab the tail of the task queue.
2847
2848 for ( i = n_tasks - 1; i >= 0; i-- )
2849 {
2850 // Save all the task pointers.
2851 args->task_queue[i] = t;
2852
2853 // Only use a single queue implementation.
2854 t->queue = 0;
2855
2856 // Determine the height of each task in the DAG.
2857 height = 0;
2858 d = t->dep_arg_head;
2859
2860 // Take the maximum height of dependent tasks.
2861 for ( j = 0; j < t->n_dep_args; j++ )
2862 {
2863 height = max( height, d->task->height );
2864 d = d->next_dep;
2865 }
2866
2867 t->height = height + 1;
2868
2869 // Since freeing a task is always a leaf, we want to force it to execute
2870 // earlier by giving it a greater height in order to reclaim memory.
2871 if ( t->func == (void *) FLA_Obj_free_buffer_task )
2872 t->height += n_tasks;
2873
2874 // Find all ready tasks.
2875 t->n_ready += t->n_input_args + t->n_output_args +
2876 t->n_macro_args + t->n_war_args;
2877
2878 if ( t->n_ready == 0 )
2879 {
2880 // Save the number of ready and available tasks.
2881 n_ready++;
2882 }
2883
2884 if ( FLA_is_owner() )
2885 {
2886 // Record all the ready values.
2887 args->n_ready[i] = t->n_ready;
2888 }
2889
2890 // Go to the previous task.
2891 t = t->prev_task;
2892 }
2893
2894 // Only allow the first core to enqueue the initial ready tasks.
2895 if ( !FLA_is_owner() )
2896 return;
2897
2898 // Grab the head of the task queue.
2900
2901 for ( i = 0; i < n_tasks && n_ready > 0; i++ )
2902 {
2903 if ( t->n_ready == 0 )
2904 {
2905 RCCE_acquire_lock( 0 );
2906
2907 // Enqueue all the ready and available tasks.
2909
2910 RCCE_release_lock( 0 );
2911
2912 // Decrement the number of ready tasks left to be enqueued.
2913 n_ready--;
2914 }
2915
2916 // Go to the next task.
2917 t = t->next_task;
2918 }
2919
2920 return;
2921}
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition FLASH_Queue.c:603
FLASH_Task * FLASH_Queue_get_tail_task(void)
Definition FLASH_Queue.c:614
int queue
Definition FLA_type_defs.h:190
int height
Definition FLA_type_defs.h:191

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, i, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_invalidate_block_gpu()

void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg
)
1850{
1852 int j, k;
1855
1856#ifdef FLA_ENABLE_MULTITHREADING
1857 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1858#endif
1859
1860 // Locate the position of the block on the GPU.
1861 for ( k = 0; k < gpu_n_blocks; k++ )
1862 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1863 break;
1864
1865 // The block is owned by other GPU.
1866 if ( k < gpu_n_blocks )
1867 {
1868 // Invalidate the block.
1869 args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;
1870
1871 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1872 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1873
1874 // Save the block that will be invalidated.
1875 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1876
1877 // Shift all the blocks for the invalidated block.
1878 for ( j = k; j < gpu_n_blocks - 1; j++ )
1879 args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j + 1];
1880
1881 // Move to the LRU block.
1882 args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
1883 }
1884
1885#ifdef FLA_ENABLE_MULTITHREADING
1886 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1887#endif
1888
1889 return;
1890}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

◆ FLASH_Queue_mark_gpu()

void FLASH_Queue_mark_gpu ( FLASH_Task * t,
void * arg
)
1793{
1795 int i, j, k;
1796 int thread = t->thread;
1799 FLA_Obj obj;
1800
1801 // Mark all the output blocks on the GPU as dirty.
1802 for ( i = t->n_output_args - 1; i >= 0; i-- )
1803 {
1804 obj = t->output_arg[i];
1805
1806 // Check for duplicate blocks.
1807 duplicate = FALSE;
1808
1809 for ( j = 0; j < i && !duplicate; j++ )
1810 {
1811 if ( obj.base == t->output_arg[j].base )
1812 duplicate = TRUE;
1813 }
1814
1815 // If the output block has not been processed before.
1816 if ( !duplicate )
1817 {
1818#ifdef FLA_ENABLE_MULTITHREADING
1819 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1820#endif
1821
1822 // Locate the position of the block on the GPU.
1823 for ( k = 0; k < gpu_n_blocks; k++ )
1824 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1825 break;
1826
1827 if ( k < gpu_n_blocks )
1828 {
1829 // Change the bits for the new dirty block.
1830 args->gpu[thread * gpu_n_blocks + k].clean = FALSE;
1831 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1832 }
1833
1834#ifdef FLA_ENABLE_MULTITHREADING
1835 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1836#endif
1837 }
1838 }
1839
1840 return;
1841}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_prefetch()

void FLASH_Queue_prefetch ( int  cache,
void * arg
)
1030{
1032 int i;
1033 int size = args->size;
1034 FLA_Obj obj;
1035
1036 // Prefetch blocks in opposite order to maintain LRU.
1037 for ( i = size - 1; i >= 0; i-- )
1038 {
1039 obj = args->prefetch[i];
1040
1041 // Only prefetch if it is a valid block.
1042 if ( obj.base != NULL )
1043 {
1044 // Prefetch the block.
1046
1047 // Record the prefetched block in the cache.
1048 args->cache[cache * size + i] = obj;
1049 }
1050 }
1051
1052 return;
1053}
void FLASH_Queue_prefetch_block(FLA_Obj obj)
Definition FLASH_Queue_exec.c:1056
FLA_Obj * cache
Definition FLASH_Queue_exec.c:86
FLA_Obj * prefetch
Definition FLASH_Queue_exec.c:89
int size
Definition FLASH_Queue_exec.c:83

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), i, FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_prefetch_block()

void FLASH_Queue_prefetch_block ( FLA_Obj  obj)
1062{
1063 int i, inc;
1065 int elem_size = FLA_Obj_elem_size( obj );
1066 int length = FLA_Obj_length( obj );
1067 int width = FLA_Obj_width( obj );
1068 FLA_Datatype datatype = FLA_Obj_datatype( obj );
1069
1070 // Determine stride to prefetch block into cache.
1072
1073 // Switch between the four different datatypes.
1074 switch ( datatype )
1075 {
1076 case FLA_FLOAT:
1077 {
1078 float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
1079 float access;
1080
1081 // Access each cache line of the block.
1082 for ( i = 0; i < length * width; i += inc )
1083 access = buffer[i];
1084
1085 // Prevent dead code elimination.
1086 access += 1.0;
1087
1088 break;
1089 }
1090 case FLA_DOUBLE:
1091 {
1092 double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
1093 double access;
1094
1095 // Access each cache line of the block.
1096 for ( i = 0; i < length * width; i += inc )
1097 access = buffer[i];
1098
1099 // Prevent dead code elimination.
1100 access += 1.0;
1101
1102 break;
1103 }
1104 case FLA_COMPLEX:
1105 {
1106 scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
1108
1109 // Access each cache line of the block.
1110 for ( i = 0; i < length * width; i += inc )
1111 access = buffer[i];
1112
1113 // Prevent dead code elimination.
1114 access.real += 1.0;
1115
1116 break;
1117 }
1118 case FLA_DOUBLE_COMPLEX:
1119 {
1120 dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
1122
1123 // Access each cache line of the block.
1124 for ( i = 0; i < length * width; i += inc )
1125 access = buffer[i];
1126
1127 // Prevent dead code elimination.
1128 access.real += 1.0;
1129
1130 break;
1131 }
1132 case FLA_INT:
1133 {
1134 int *buffer = ( int * ) FLA_INT_PTR( obj );
1135 int access;
1136
1137 // Access each cache line of the block.
1138 for ( i = 0; i < length * width; i += inc )
1139 access = buffer[i];
1140
1141 // Prevent dead code elimination.
1142 access += 1.0;
1143
1144 break;
1145 }
1146 default:
1147 // This default case should never execute.
1149 }
1150
1151 return;
1152}
dim_t FLASH_Queue_get_cache_line_size(void)
Definition FLASH_Queue.c:524
dim_t FLA_Obj_elem_size(FLA_Obj obj)
Definition FLA_Query.c:95
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition FLA_Query.c:13
Definition blis_type_defs.h:138
double real
Definition blis_type_defs.h:139
Definition blis_type_defs.h:133
float real
Definition blis_type_defs.h:134

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), i, scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

◆ FLASH_Queue_push()

void FLASH_Queue_push ( void * func,
void * cntl,
char * name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args,
  ... 
)
639{
640 int i;
642 FLASH_Task* t;
643 FLA_Obj obj;
644
645 // Allocate a new FLA_Task and populate its fields with appropriate values.
646 t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
647 n_int_args, n_fla_args,
648 n_input_args, n_output_args );
649
650 // Initialize variable argument environment. In case you're wondering, the
651 // second argument in this macro invocation of va_start() is supposed to be
652 // the parameter that immediately precedes the variable argument list
653 // (ie: the ... above ).
654 va_start( var_arg_list, n_output_args );
655
656 // Extract the integer arguments.
657 for ( i = 0; i < n_int_args; i++ )
658 t->int_arg[i] = va_arg( var_arg_list, int );
659
660 // Extract the FLA_Obj arguments.
661 for ( i = 0; i < n_fla_args; i++ )
662 t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );
663
664 // Extract the input FLA_Obj arguments.
665 for ( i = 0; i < n_input_args; i++ )
666 {
667 obj = va_arg( var_arg_list, FLA_Obj );
668 t->input_arg[i] = obj;
669
670 // Macroblock is used.
671 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
672 {
673 dim_t jj, kk;
674 dim_t m = FLA_Obj_length( obj );
675 dim_t n = FLA_Obj_width( obj );
676 dim_t cs = FLA_Obj_col_stride( obj );
677 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
678
679 // Dependence analysis for each input block in macroblock.
680 for ( jj = 0; jj < n; jj++ )
681 for ( kk = 0; kk < m; kk++ )
682 FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );
683
684 // Set the number of blocks in the macroblock subtracted by one
685 // since we do not want to recount an operand for each n_input_arg.
686 t->n_macro_args += m * n - 1;
687 }
688 else // Regular block.
689 {
690 // Dependence analysis for input operand.
692 }
693 }
694
695 // Extract the output FLA_Obj arguments.
696 for ( i = 0; i < n_output_args; i++ )
697 {
698 obj = va_arg( var_arg_list, FLA_Obj );
699 t->output_arg[i] = obj;
700
701 // Only assign data affinity to the first output block.
702 if ( i == 0 )
703 {
704 FLA_Obj buf = obj;
705
706 // Use the top left block of the macroblock.
707 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
708 buf = *FLASH_OBJ_PTR_AT( obj );
709
710 if ( buf.base->write_task == NULL )
711 t->queue = flash_queue_n_write_blocks;
712 else
713 t->queue = buf.base->write_task->queue;
714 }
715
716 // Macroblock is used.
717 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
718 {
719 dim_t jj, kk;
720 dim_t m = FLA_Obj_length( obj );
721 dim_t n = FLA_Obj_width( obj );
722 dim_t cs = FLA_Obj_col_stride( obj );
723 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
724
725 // Dependence analysis for each output block in macroblock.
726 for ( jj = 0; jj < n; jj++ )
727 for ( kk = 0; kk < m; kk++ )
728 FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );
729
730 // Set the number of blocks in the macroblock subtracted by one
731 // since we do not want to recount an operand for each n_output_arg.
732 t->n_macro_args += m * n - 1;
733 }
734 else // Regular block.
735 {
736 // Dependence analysis for output operand.
738 }
739 }
740
741 // Finalize the variable argument environment.
743
744 // Add the task to the tail of the queue (and the head if queue is empty).
745 if ( _tq.n_tasks == 0 )
746 {
747 _tq.head = t;
748 _tq.tail = t;
749 }
750 else
751 {
752 t->prev_task = _tq.tail;
753 _tq.tail->next_task = t;
754 _tq.tail = t;
755
756 // Determine the index of the task in the task queue.
757 t->order = t->prev_task->order + 1;
758 }
759
760 // Increment the number of tasks.
761 _tq.n_tasks++;
762
763 return;
764}
void FLASH_Queue_push_output(FLA_Obj obj, FLASH_Task *t)
Definition FLASH_Queue.c:842
FLASH_Task * FLASH_Task_alloc(void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args)
Definition FLASH_Queue.c:956
void FLASH_Queue_push_input(FLA_Obj obj, FLASH_Task *t)
Definition FLASH_Queue.c:767
FLASH_Task * next_task
Definition FLA_type_defs.h:237
int order
Definition FLA_type_defs.h:189
FLASH_Task * prev_task
Definition FLA_type_defs.h:236

References _tq, FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Queue_push_output(), FLASH_Task_alloc(), FLASH_Queue_s::head, i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::queue, FLASH_Queue_s::tail, and FLA_Obj_struct::write_task.

◆ FLASH_Queue_push_input()

void FLASH_Queue_push_input ( FLA_Obj  obj,
FLASH_Task * t
)
774{
775 FLASH_Task* task;
776 FLASH_Dep* d;
777
778 // Find dependence information.
779 if ( obj.base->write_task == NULL )
780 {
781 t->n_ready--;
782
783 // Add to number of blocks read if not written and not read before.
784 if ( obj.base->n_read_tasks == 0 )
785 {
786 // Identify each read block with an id for freeing.
787 obj.base->n_read_blocks = flash_queue_n_read_blocks;
788
789 flash_queue_n_read_blocks++;
790 }
791 }
792 else
793 { // Flow dependence.
794 task = obj.base->write_task;
795
796 d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
797
798 d->task = t;
799 d->next_dep = NULL;
800
801 if ( task->n_dep_args == 0 )
802 {
803 task->dep_arg_head = d;
804 task->dep_arg_tail = d;
805 }
806 else
807 {
808 task->dep_arg_tail->next_dep = d;
809 task->dep_arg_tail = d;
810 }
811
812 task->n_dep_args++;
813 }
814
815 // Add task to the read task in the object if not already there.
816 if ( obj.base->n_read_tasks == 0 ||
817 obj.base->read_task_tail->task != t )
818 { // Anti-dependence potentially.
819 d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
820
821 d->task = t;
822 d->next_dep = NULL;
823
824 if ( obj.base->n_read_tasks == 0 )
825 {
826 obj.base->read_task_head = d;
827 obj.base->read_task_tail = d;
828 }
829 else
830 {
831 obj.base->read_task_tail->next_dep = d;
832 obj.base->read_task_tail = d;
833 }
834
835 obj.base->n_read_tasks++;
836 }
837
838 return;
839}
int n_dep_args
Definition FLA_type_defs.h:231
FLASH_Dep * dep_arg_head
Definition FLA_type_defs.h:232
FLASH_Dep * dep_arg_tail
Definition FLA_type_defs.h:233
FLASH_Dep * read_task_tail
Definition FLA_type_defs.h:151
FLASH_Dep * read_task_head
Definition FLA_type_defs.h:150
int n_read_blocks
Definition FLA_type_defs.h:145
FLASH_Task * write_task
Definition FLA_type_defs.h:154
int n_read_tasks
Definition FLA_type_defs.h:149

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_push().

◆ FLASH_Queue_push_output()

void FLASH_Queue_push_output ( FLA_Obj  obj,
FLASH_Task * t
)
849{
850 int i;
851 FLASH_Task* task;
852 FLASH_Dep* d;
853 FLASH_Dep* next_dep;
854
855 // Assign tasks to threads with data affinity.
856 if ( obj.base->write_task == NULL )
857 {
858 t->n_ready--;
859
860 // Save index in which this output block is first encountered.
861 obj.base->n_write_blocks = flash_queue_n_write_blocks;
862
863 // Number of blocks written if not written before.
864 flash_queue_n_write_blocks++;
865
866 // Add to number of blocks read if not written or read before.
867 if ( obj.base->n_read_tasks == 0 )
868 {
869 // Identify each read block with an id for freeing.
870 obj.base->n_read_blocks = flash_queue_n_read_blocks;
871
872 flash_queue_n_read_blocks++;
873 }
874 }
875 else
876 { // Flow dependence potentially.
877 // The last task to overwrite this block is not itself.
878 if ( obj.base->write_task != t )
879 {
880 // Create dependency from task that last wrote the block.
881 task = obj.base->write_task;
882
883 d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
884
885 d->task = t;
886 d->next_dep = NULL;
887
888 if ( task->n_dep_args == 0 )
889 {
890 task->dep_arg_head = d;
891 task->dep_arg_tail = d;
892 }
893 else
894 {
895 task->dep_arg_tail->next_dep = d;
896 task->dep_arg_tail = d;
897 }
898
899 task->n_dep_args++;
900 }
901 else
902 {
903 // No need to notify task twice for output block already seen.
904 t->n_ready--;
905 }
906 }
907
908 // Clear read task for next set of reads and record the anti-dependence.
909 d = obj.base->read_task_head;
910
911 for ( i = 0; i < obj.base->n_read_tasks; i++ )
912 {
913 task = d->task;
914 next_dep = d->next_dep;
915
916 // If the last task to read is not the current task, add dependence.
917 if ( task != t )
918 {
919 d->task = t;
920 d->next_dep = NULL;
921
922 if ( task->n_dep_args == 0 )
923 {
924 task->dep_arg_head = d;
925 task->dep_arg_tail = d;
926 }
927 else
928 {
929 task->dep_arg_tail->next_dep = d;
930 task->dep_arg_tail = d;
931 }
932
933 task->n_dep_args++;
934
935 t->n_war_args++;
936 }
937 else
938 {
939 FLA_free( d );
940 }
941
942 d = next_dep;
943 }
944
945 obj.base->n_read_tasks = 0;
946 obj.base->read_task_head = NULL;
947 obj.base->read_task_tail = NULL;
948
949 // Record this task as the last to write to this block.
950 obj.base->write_task = t;
951
952 return;
953}
int n_write_blocks
Definition FLA_type_defs.h:146

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_free(), FLA_malloc(), i, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_push().

◆ FLASH_Queue_reset()

void FLASH_Queue_reset ( void  )
589{
590 // Clear the other fields of the FLASH_Queue structure.
591 _tq.n_tasks = 0;
592 _tq.head = NULL;
593 _tq.tail = NULL;
594
595 // Reset the number of blocks.
596 flash_queue_n_read_blocks = 0;
597 flash_queue_n_write_blocks = 0;
598
599 return;
600}

References _tq, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_init().

◆ FLASH_Queue_set_block_size()

void FLASH_Queue_set_block_size ( dim_t  size)
467{
468 // Only adjust the block size if the new block is larger.
469 if ( flash_queue_block_size < size )
470 flash_queue_block_size = size;
471
472 return;
473}

Referenced by FLASH_Obj_create_hierarchy().

◆ FLASH_Queue_set_cache_line_size()

void FLASH_Queue_set_cache_line_size ( dim_t  size)
517{
518 flash_queue_cache_line_size = size;
519
520 return;
521}

◆ FLASH_Queue_set_cache_size()

void FLASH_Queue_set_cache_size ( dim_t  size)
493{
494 flash_queue_cache_size = size;
495
496 return;
497}

◆ FLASH_Queue_set_caching()

void FLASH_Queue_set_caching ( FLA_Bool  caching)
349{
350 flash_queue_caching = caching;
351
352 return;
353}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_set_cores_per_cache()

void FLASH_Queue_set_cores_per_cache ( int  cores)
541{
542 flash_queue_cores_per_cache = cores;
543
544 return;
545}

◆ FLASH_Queue_set_cores_per_queue()

void FLASH_Queue_set_cores_per_queue ( int  cores)
565{
566 flash_queue_cores_per_queue = cores;
567
568 return;
569}

◆ FLASH_Queue_set_data_affinity()

void FLASH_Queue_set_data_affinity ( FLASH_Data_aff  data_affinity)
397{
398 flash_queue_data_affinity = data_affinity;
399
400 return;
401}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_set_num_threads()

void FLASH_Queue_set_num_threads ( unsigned int  n_threads)
193{
195
196 // Verify that the number of threads is positive.
199
200 // Keep track of the number of threads internally.
201 flash_queue_n_threads = n_threads;
202
203#if FLA_MULTITHREADING_MODEL == FLA_OPENMP
204
205 // No additional action is necessary to set the number of OpenMP threads
206 // since setting the number of threads is handled at the parallel for loop
207 // with a num_threads() clause. This gives the user more flexibility since
208 // he can use the OMP_NUM_THREADS environment variable or the
209 // omp_set_num_threads() function to set the global number of OpenMP threads
210 // independently of the number of SuperMatrix threads.
211
212#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
213
214 // No additional action is necessary to set the number of pthreads
215 // since setting the number of threads is handled entirely on our end.
216
217#endif
218
219 return;
220}
FLA_Error FLA_Check_num_threads(unsigned int n_threads)
Definition FLA_Check.c:884

References FLA_Check_num_threads().

◆ FLASH_Queue_set_parallel_time()

void FLASH_Queue_set_parallel_time ( double  dtime)
454{
455 flash_queue_parallel_time = dtime;
456
457 return;
458}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_set_sorting()

void FLASH_Queue_set_sorting ( FLA_Bool  sorting)
325{
326 flash_queue_sorting = sorting;
327
328 return;
329}

◆ FLASH_Queue_set_verbose_output()

void FLASH_Queue_set_verbose_output ( FLASH_Verbose  verbose)
301{
302 flash_queue_verbose = verbose;
303
304 return;
305}

◆ FLASH_Queue_set_work_stealing()

void FLASH_Queue_set_work_stealing ( FLA_Bool  work_stealing)
373{
374 flash_queue_work_stealing = work_stealing;
375
376 return;
377}

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_stack_depth()

unsigned int FLASH_Queue_stack_depth ( void  )
112{
113 return flash_queue_stack;
114}

Referenced by FLASH_Eig_gest(), FLASH_LU_incpiv(), FLASH_QR_UT_inc(), FLASH_Queue_disable_gpu(), and FLASH_Queue_enable_gpu().

◆ FLASH_Queue_update_block_gpu()

void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void * arg
)
1704{
1706 int j, k;
1712
1713#ifdef FLA_ENABLE_MULTITHREADING
1714 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1715#endif
1716
1717 // Locate the position of the block on GPU.
1718 for ( k = 0; k < gpu_n_blocks - 1; k++ )
1719 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1720 break;
1721
1722 // Save the pointer to the data on the GPU.
1723 buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;
1724
1725 // Save the victim block.
1726 evict_obj = args->gpu[thread * gpu_n_blocks + k];
1727
1728 // The block is not already in the GPU.
1729 if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
1730 {
1731 // Save for data transfer outside of critical section.
1732 transfer = TRUE;
1733
1734 // Save for eviction outside of critical section.
1735 if ( evict_obj.obj.base != NULL && !evict_obj.clean )
1736 {
1737 evict = TRUE;
1738 args->victim[thread] = evict_obj;
1739 }
1740
1741 // Save the block in the data structure.
1742 args->gpu[thread * gpu_n_blocks + k].obj = obj;
1743
1744 // Make sure the new block is clean.
1745 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1746 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1747 }
1748
1749 // Use the block on the GPU that is a hit or LRU.
1750 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1751
1752 // Shift all the previous tasks for LRU replacement.
1753 for ( j = k; j > 0; j-- )
1754 args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
1755
1756 // Place the block on the cache as the most recently used.
1757 args->gpu[thread * gpu_n_blocks] = gpu_obj;
1758
1759#ifdef FLA_ENABLE_MULTITHREADING
1760 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1761#endif
1762
1763 // Evict and flush the LRU dirty block.
1764 if ( evict )
1765 {
1766 FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );
1767
1768#ifdef FLA_ENABLE_MULTITHREADING
1769 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1770#endif
1771
1772 args->victim[thread].obj.base = NULL;
1773
1774#ifdef FLA_ENABLE_MULTITHREADING
1775 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1776#endif
1777 }
1778
1779 // Move the block to the GPU.
1780 if ( transfer )
1781 FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1782
1783 return;
1784}
FLA_Error FLASH_Queue_write_gpu(FLA_Obj obj, void *buffer_gpu)
Definition FLASH_Queue_gpu.c:185

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

◆ FLASH_Queue_update_cache()

void FLASH_Queue_update_cache ( FLASH_Task * t,
void * arg
)
853{
854 int i, j;
856 FLA_Obj obj;
857
858 if ( t == NULL )
859 return;
860
861 // Updating the input blocks.
862 for ( i = t->n_input_args - 1; i >= 0; i-- )
863 {
864 // Check for duplicate blocks.
866
867 for ( j = 0; j < t->n_output_args && !duplicate; j++ )
868 {
869 if ( t->input_arg[i].base == t->output_arg[j].base )
870 duplicate = TRUE;
871 }
872
873 for ( j = 0; j < i && !duplicate; j++ )
874 {
875 if ( t->input_arg[i].base == t->input_arg[j].base )
876 duplicate = TRUE;
877 }
878
879 // If the input block has not been processed before.
880 if ( !duplicate )
881 {
882 obj = t->input_arg[i];
883
884 // Macroblock is used.
885 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
886 {
887 dim_t jj, kk;
888 dim_t m = FLA_Obj_length( obj );
889 dim_t n = FLA_Obj_width( obj );
890 dim_t cs = FLA_Obj_col_stride( obj );
891 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
892
893 // Dependence analysis for each input block in macroblock.
894 for ( jj = 0; jj < n; jj++ )
895 for ( kk = 0; kk < m; kk++ )
896 FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
897 t->cache, FALSE, arg );
898 }
899 else // Regular block.
900 {
901 FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
902 }
903 }
904 }
905
906 // Updating the output blocks.
907 for ( i = t->n_output_args - 1; i >= 0; i-- )
908 {
909 // Check for duplicate blocks.
911
912 for ( j = 0; j < i && !duplicate; j++ )
913 {
914 if ( t->output_arg[i].base == t->output_arg[j].base )
915 duplicate = TRUE;
916 }
917
918 // If the output block has not been processed before.
919 if ( !duplicate )
920 {
921 obj = t->output_arg[i];
922
923 // Macroblock is used.
924 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
925 {
926 dim_t jj, kk;
927 dim_t m = FLA_Obj_length( obj );
928 dim_t n = FLA_Obj_width( obj );
929 dim_t cs = FLA_Obj_col_stride( obj );
930 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
931
932 // Dependence analysis for each output block in macroblock.
933 for ( jj = 0; jj < n; jj++ )
934 for ( kk = 0; kk < m; kk++ )
936 t->cache, TRUE, arg );
937 }
938 else // Regular block.
939 {
940 FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
941 }
942 }
943 }
944
945 return;
946}
void FLASH_Queue_update_cache_block(FLA_Obj obj, int cache, FLA_Bool output, void *arg)
Definition FLASH_Queue_exec.c:949

References FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), and i.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_update_cache_block()

void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void * arg
)
958{
960 int i, j, k;
961 int n_caches = args->n_caches;
962 int size = args->size;
963
964#ifdef FLA_ENABLE_MULTITHREADING
965 FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
966#endif
967
968 // Locate the position of the block in the cache.
969 for ( k = 0; k < size - 1; k++ )
970 {
971 if ( obj.base == args->cache[cache * size + k].base )
972 break;
973 }
974
975 // Shift all the previous tasks for LRU replacement.
976 for ( j = k; j > 0; j-- )
977 args->cache[cache * size + j] = args->cache[cache * size + j - 1];
978
979 // Place the block on the cache as the most recently used.
980 args->cache[cache * size] = obj;
981
982#ifdef FLA_ENABLE_MULTITHREADING
983 FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
984#endif
985
986 // Write invalidate if updating with output block.
987 if ( output )
988 {
989 for ( i = 0; i < n_caches; i++ )
990 {
991 if ( i != cache )
992 {
993#ifdef FLA_ENABLE_MULTITHREADING
994 FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
995#endif
996 // Locate the position of the block in the cache.
997 for ( k = 0; k < size; k++ )
998 {
999 if ( obj.base == args->cache[i * size + k].base )
1000 break;
1001 }
1002
1003 // The block is owned by other thread.
1004 if ( k < size )
1005 {
1006 // Shift all the blocks for the invalidated block.
1007 for ( j = k; j < size - 1; j++ )
1008 args->cache[i * size + j] = args->cache[i * size + j + 1];
1009
1010 // Invalidate the block.
1011 args->cache[i * size + size - 1].base = NULL;
1012 }
1013#ifdef FLA_ENABLE_MULTITHREADING
1014 FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
1015#endif
1016 }
1017 }
1018 }
1019
1020 return;
1021}
FLA_Lock * cac_lock
Definition FLASH_Queue_exec.c:74
int n_caches
Definition FLASH_Queue_exec.c:80

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), i, FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

◆ FLASH_Queue_update_gpu()

void FLASH_Queue_update_gpu ( FLASH_Task * t,
void **  input_arg,
void **  output_arg,
void * arg 
)
1599{
1600 int i, j, k;
1601 int thread = t->thread;
1602 int n_threads = FLASH_Queue_get_num_threads();
1603 FLA_Bool duplicate;
1604
1605 // None of the arguments can be macroblocks yet.
1606 // Complicating factor is copying macroblock to contiguous memory on GPU.
1607
1608 // Bring the input arguments to the GPU.
1609 for ( i = t->n_input_args - 1; i >= 0; i-- )
1610 {
1611 // Check for duplicate blocks.
1612 duplicate = FALSE;
1613
1614 for ( j = 0; j < t->n_output_args && !duplicate; j++ )
1615 {
1616 if ( t->input_arg[i].base == t->output_arg[j].base )
1617 duplicate = TRUE;
1618 }
1619
1620 for ( j = 0; j < i && !duplicate; j++ )
1621 {
1622 if ( t->input_arg[i].base == t->input_arg[j].base )
1623 duplicate = TRUE;
1624 }
1625
1626 // If the input block has not been processed before.
1627 if ( !duplicate )
1628 {
1629 FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
1630 }
1631 else
1632 {
1633 input_arg[i] = NULL;
1634 }
1635 }
1636
1637 // Bring the output arguments to the GPU.
1638 for ( i = t->n_output_args - 1; i >= 0; i-- )
1639 {
1640 // Check for duplicate blocks.
1641 duplicate = FALSE;
1642
1643 for ( j = 0; j < i && !duplicate; j++ )
1644 {
1645 if ( t->output_arg[i].base == t->output_arg[j].base )
1646 duplicate = TRUE;
1647 }
1648
1649 // If the output block has not been processed before.
1650 if ( !duplicate )
1651 {
1652 FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );
1653
1654 // Invalidate output blocks on all other GPUs.
1655 for ( k = 0; k < n_threads; k++ )
1656 if ( k != thread )
1657 FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
1658 }
1659 else
1660 {
1661 output_arg[i] = NULL;
1662 }
1663 }
1664
1665 // Check to see if there are any duplicates.
1666 for ( i = t->n_input_args - 1; i >= 0; i-- )
1667 {
1668 for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
1669 {
1670 if ( t->input_arg[i].base == t->output_arg[j].base )
1671 input_arg[i] = output_arg[j];
1672 }
1673
1674 for ( j = 0; j < i && input_arg[i] == NULL; j++ )
1675 {
1676 if ( t->input_arg[i].base == t->input_arg[j].base )
1677 input_arg[i] = input_arg[j];
1678 }
1679 }
1680
1681 // Check to see if there are any duplicates.
1682 for ( i = t->n_output_args - 1; i >= 0; i-- )
1683 {
1684 for ( j = 0; j < i && output_arg[i] == NULL; j++ )
1685 {
1686 if ( t->output_arg[i].base == t->output_arg[j].base )
1687 output_arg[i] = output_arg[j];
1688 }
1689 }
1690
1691 return;
1692}
void FLASH_Queue_update_block_gpu(FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
Definition FLASH_Queue_exec.c:1695

References FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), and i.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_verbose_output()

void FLASH_Queue_verbose_output ( void  )
1788{
1789 int i, j, k;
1791 int n_tasks = FLASH_Queue_get_num_tasks();
1793 FLASH_Task* t;
1794 FLASH_Dep* d;
1795
1796 // Grab the head of the task queue.
1797 t = FLASH_Queue_get_head_task();
1798
1800 {
1801 // Iterate over linked list of tasks.
1802 for ( i = 0; i < n_tasks; i++ )
1803 {
1804 printf( "%d\t%s\t", t->order, t->name );
1805
1806 for ( j = 0; j < t->n_output_args; j++ )
1807 printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1808 t->output_arg[j].base->m_index,
1809 t->output_arg[j].base->n_index );
1810
1811 printf( ":= " );
1812
1813 for ( j = 0; j < t->n_output_args; j++ )
1814 printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1815 t->output_arg[j].base->m_index,
1816 t->output_arg[j].base->n_index );
1817
1818 for ( j = 0; j < t->n_input_args; j++ )
1819 printf( "%lu[%lu,%lu] ", t->input_arg[j].base->id,
1820 t->input_arg[j].base->m_index,
1821 t->input_arg[j].base->n_index );
1822
1823 printf( "\n" );
1824
1825 // Go to the next task.
1826 t = t->next_task;
1827 }
1828
1829 printf( "\n" );
1830 }
1831 else
1832 {
1833 printf( "digraph SuperMatrix {\n" );
1834
1836 {
1837 // Iterate over linked list of tasks.
1838 for ( i = 0; i < n_tasks; i++ )
1839 {
1840 printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);
1841
1842 d = t->dep_arg_head;
1843 for ( j = 0; j < t->n_dep_args; j++ )
1844 {
1845 printf( "%d;", d->task->order );
1846 d = d->next_dep;
1847 }
1848
1849 printf( "};\n" );
1850
1851 // Go to the next task.
1852 t = t->next_task;
1853 }
1854 }
1855 else
1856 {
1857 // Iterate over all the threads.
1858 for ( k = 0; k < n_threads; k++ )
1859 {
1860 printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );
1861
1862 // Iterate over linked list of tasks.
1863 for ( i = 0; i < n_tasks; i++ )
1864 {
1865 if ( t->queue == k )
1866 printf( "%d [label=\"%s\"];\n", t->order, t->name );
1867
1868 // Go to the next task.
1869 t = t->next_task;
1870 }
1871
1872 printf( "}\n" );
1873
1874 // Grab the head of the task queue.
1875 t = FLASH_Queue_get_head_task();
1876 }
1877
1878 // Iterate over linked list of tasks.
1879 for ( i = 0; i < n_tasks; i++ )
1880 {
1881 printf( "%d -> {", t->order );
1882
1883 d = t->dep_arg_head;
1884 for ( j = 0; j < t->n_dep_args; j++ )
1885 {
1886 printf( "%d;", d->task->order );
1887 d = d->next_dep;
1888 }
1889
1890 printf( "};\n" );
1891
1892 // Go to the next task.
1893 t = t->next_task;
1894 }
1895 }
1896
1897 printf( "}\n\n" );
1898 }
1899
1900 return;
1901}
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition FLASH_Queue.c:308
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition FLASH_Queue.c:603
FLASH_Data_aff FLASH_Queue_get_data_affinity(void)
Definition FLASH_Queue.c:404
unsigned int FLASH_Queue_get_num_threads(void)
Definition FLASH_Queue.c:223
unsigned int FLASH_Queue_get_num_tasks(void)
Definition FLASH_Queue.c:284

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), i, FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_wait_dequeue()

FLASH_Task * FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void * arg 
)

◆ FLASH_Queue_wait_dequeue_block()

FLASH_Task * FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void * arg 
)
785{
787 int i, j, k;
788 int size = args->size;
789 int n_tasks = args->wait_queue[queue].n_tasks;
791 FLASH_Task* t;
792 FLA_Obj obj;
793 FLA_Obj mem;
794
795#ifdef FLA_ENABLE_GPU
797
798 // If using GPUs, then only check GPU and not the cache.
799 if ( enabled )
801#endif
802
803 t = args->wait_queue[queue].head;
804
805 // Check if any of the output blocks are in the cache.
806 for ( i = 0; i < n_tasks; i++ )
807 {
808 for ( j = 0; j < size; j++ )
809 {
810 // Initialize the memory just in case.
811 mem.base = NULL;
812
813 // Determine if using GPU or not.
814 if ( enabled )
815 {
816#ifdef FLA_ENABLE_GPU
817 mem = args->gpu[cache * size + j].obj;
818#endif
819 }
820 else
821 {
822 mem = args->cache[cache * size + j];
823 }
824
825 for ( k = 0; k < t->n_output_args; k++ )
826 {
827 obj = t->output_arg[k];
828
829 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
830 obj = *FLASH_OBJ_PTR_AT( obj );
831
832 // Return the task if its output block is in cache.
833 if ( mem.base == obj.base )
834 {
835 t->hit = TRUE;
836 return t;
837 }
838 }
839 }
840 t = t->next_wait;
841 }
842
843 return args->wait_queue[queue].head;
844}

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, i, FLASH_Queue_s::n_tasks, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

◆ FLASH_Queue_wait_enqueue()

void FLASH_Queue_wait_enqueue ( FLASH_Task * t,
void * arg 
)
2930{
2932 int i = args->n_wait[0] + args->pc[0];
2933
2934 // Insertion sort of tasks in waiting queue.
2936 {
2937 for ( ; i > args->pc[0]; i-- )
2938 {
2939 if ( args->task_queue[args->wait_queue[i-1]]->height >
2940 args->task_queue[t->order]->height )
2941 break;
2942
2943 args->wait_queue[i] = args->wait_queue[i-1];
2944 }
2945 }
2946
2947 args->wait_queue[i] = t->order;
2948
2949 // Increment number of tasks on waiting queue.
2950 args->n_wait[0]++;
2951
2952 return;
2953}
FLA_Bool FLASH_Queue_get_sorting(void)
Definition FLASH_Queue.c:332

References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, i, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().

◆ FLASH_Queue_work_stealing()

FLASH_Task * FLASH_Queue_work_stealing ( int  queue,
void * arg 
)
1161{
1163 int q;
1164 int n_queues = args->n_queues;
1165 FLASH_Task* t = NULL;
1166
1167 // Do not perform work stealing if there is only one queue.
1168 if ( n_queues == 1 )
1169 return t;
1170
1171 // Find a random queue not equal to the current queue.
1172 do
1173 {
1174#ifdef FLA_ENABLE_WINDOWS_BUILD
1175 rand_s( &q );
1176 q = q % n_queues;
1177#else
1178#ifdef FLA_ENABLE_TIDSP
1179 q = rand() % n_queues;
1180#else
1181 q = lrand48() % n_queues;
1182#endif
1183#endif
1184 }
1185 while ( q == queue );
1186
1187#ifdef FLA_ENABLE_MULTITHREADING
1188 FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189#endif
1190
1191 // If there are tasks that this thread can steal.
1192 if ( args->wait_queue[q].n_tasks > 0 )
1193 {
1194 // Dequeue the last task.
1195 t = args->wait_queue[q].tail;
1196
1197 if ( args->wait_queue[q].n_tasks == 1 )
1198 {
1199 // Clear the queue of its only task.
1200 args->wait_queue[q].head = NULL;
1201 args->wait_queue[q].tail = NULL;
1202 }
1203 else
1204 {
1205 // Adjust pointers in waiting queue.
1206 args->wait_queue[q].tail = t->prev_wait;
1207 args->wait_queue[q].tail->next_wait = NULL;
1208 }
1209
1210 // Reset waiting queue data about the stolen task.
1211 t->queue = queue;
1212 t->prev_wait = NULL;
1213 t->next_wait = NULL;
1214
1215 args->wait_queue[q].n_tasks--;
1216 }
1217
1218#ifdef FLA_ENABLE_MULTITHREADING
1219 FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220#endif
1221
1222 return t;
1223}
FLASH_Task * prev_wait
Definition FLA_type_defs.h:240
FLASH_Task * next_wait
Definition FLA_type_defs.h:241

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, i, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Task_alloc()

FLASH_Task * FLASH_Task_alloc ( void * func,
void * cntl,
char * name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args 
)
969{
970 FLASH_Task* t;
971
972 // Allocate space for the task structure t.
973 t = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );
974
975 // Allocate space for the task's integer arguments.
976 t->int_arg = (int *) FLA_malloc( n_int_args * sizeof(int) );
977
978 // Allocate space for the task's FLA_Obj arguments.
979 t->fla_arg = (FLA_Obj *) FLA_malloc( n_fla_args * sizeof(FLA_Obj) );
980
981 // Allocate space for the task's input FLA_Obj arguments.
982 t->input_arg = (FLA_Obj *) FLA_malloc( n_input_args * sizeof(FLA_Obj) );
983
984 // Allocate space for the task's output FLA_Obj arguments.
985 t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );
986
987 // Initialize other fields of the structure.
988 t->n_ready = 0;
989 t->order = 0;
990 t->queue = 0;
991 t->height = 0;
992 t->thread = 0;
993 t->cache = 0;
994 t->hit = FALSE;
995
996 t->func = func;
997 t->cntl = cntl;
998 t->name = name;
999 t->enabled_gpu = enabled_gpu;
1000 t->n_int_args = n_int_args;
1001 t->n_fla_args = n_fla_args;
1002 t->n_input_args = n_input_args;
1003 t->n_output_args = n_output_args;
1004
1005 t->n_macro_args = 0;
1006 t->n_war_args = 0;
1007 t->n_dep_args = 0;
1008 t->dep_arg_head = NULL;
1009 t->dep_arg_tail = NULL;
1010 t->prev_task = NULL;
1011 t->next_task = NULL;
1012 t->prev_wait = NULL;
1013 t->next_wait = NULL;
1014
1015 // Return a pointer to the initialized structure.
1016 return t;
1017}
int * int_arg
Definition FLA_type_defs.h:210

References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_push().

◆ FLASH_Task_free()

void FLASH_Task_free ( FLASH_Task * t)
1026{
1027 int i, j, k;
1028 FLA_Obj obj;
1029 FLASH_Dep* d;
1030 FLASH_Dep* next_dep;
1031
1032 // Clearing the last write task in each output block.
1033 for ( i = 0; i < t->n_output_args; i++ )
1034 {
1035 obj = t->output_arg[i];
1036
1037 // Macroblock is used.
1038 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1039 {
1040 dim_t jj, kk;
1041 dim_t m = FLA_Obj_length( obj );
1042 dim_t n = FLA_Obj_width( obj );
1043 dim_t cs = FLA_Obj_col_stride( obj );
1044 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1045
1046 // Clear each block in macroblock.
1047 for ( jj = 0; jj < n; jj++ )
1048 for ( kk = 0; kk < m; kk++ )
1049 ( buf + jj * cs + kk )->base->write_task = NULL;
1050 }
1051 else // Clear regular block.
1052 {
1053 obj.base->write_task = NULL;
1054 }
1055 }
1056
1057 // Cleaning the last read tasks in each input block.
1058 for ( i = 0; i < t->n_input_args; i++ )
1059 {
1060 obj = t->input_arg[i];
1061
1062 // Macroblock is used.
1063 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1064 {
1065 dim_t jj, kk;
1066 dim_t m = FLA_Obj_length( obj );
1067 dim_t n = FLA_Obj_width( obj );
1068 dim_t cs = FLA_Obj_col_stride( obj );
1069 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1070
1071 // Clear each block in macroblock.
1072 for ( jj = 0; jj < n; jj++ )
1073 {
1074 for ( kk = 0; kk < m; kk++ )
1075 {
1076 obj = *( buf + jj * cs + kk );
1077
1078 k = obj.base->n_read_tasks;
1079 d = obj.base->read_task_head;
1080
1081 obj.base->n_read_tasks = 0;
1082 obj.base->read_task_head = NULL;
1083 obj.base->read_task_tail = NULL;
1084
1085 for ( j = 0; j < k; j++ )
1086 {
1087 next_dep = d->next_dep;
1088 FLA_free( d );
1089 d = next_dep;
1090 }
1091 }
1092 }
1093 }
1094 else // Regular block.
1095 {
1096 k = obj.base->n_read_tasks;
1097 d = obj.base->read_task_head;
1098
1099 obj.base->n_read_tasks = 0;
1100 obj.base->read_task_head = NULL;
1101 obj.base->read_task_tail = NULL;
1102
1103 for ( j = 0; j < k; j++ )
1104 {
1105 next_dep = d->next_dep;
1106 FLA_free( d );
1107 d = next_dep;
1108 }
1109 }
1110 }
1111
1112 // Free the dep_arg field of t.
1113 d = t->dep_arg_head;
1114
1115 for ( i = 0; i < t->n_dep_args; i++ )
1116 {
1117 next_dep = d->next_dep;
1118 FLA_free( d );
1119 d = next_dep;
1120 }
1121
1122 // Free the int_arg field of t.
1123 FLA_free( t->int_arg );
1124
1125 // Free the fla_arg field of t.
1126 FLA_free( t->fla_arg );
1127
1128 // Free the input_arg field of t.
1129 FLA_free( t->input_arg );
1130
1131 // Free the output_arg field of t.
1132 FLA_free( t->output_arg );
1133
1134 // Finally, free the struct itself.
1135 FLA_free( t );
1136
1137 return;
1138}

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().

◆ FLASH_Task_free_parallel()

void FLASH_Task_free_parallel ( FLASH_Task * t,
void * arg 
)
2452{
2454 int i, j, k;
2455 int thread;
2457 FLASH_Dep* d;
2458 FLASH_Dep* next_dep;
2459 FLA_Obj obj;
2460
2461 // Clearing the last write task in each output block.
2462 for ( i = 0; i < t->n_output_args; i++ )
2463 {
2464 obj = t->output_arg[i];
2465
2466 // Macroblock is used.
2467 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468 {
2469 dim_t jj, kk;
2470 dim_t m = FLA_Obj_length( obj );
2471 dim_t n = FLA_Obj_width( obj );
2472 dim_t cs = FLA_Obj_col_stride( obj );
2473 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2474
2475 // Clear each block in macroblock.
2476 for ( jj = 0; jj < n; jj++ )
2477 for ( kk = 0; kk < m; kk++ )
2478 ( buf + jj * cs + kk )->base->write_task = NULL;
2479 }
2480 else // Clear regular block.
2481 {
2482 obj.base->write_task = NULL;
2483 }
2484 }
2485
2486 // Cleaning the last read tasks in each input block.
2487 for ( i = 0; i < t->n_input_args; i++ )
2488 {
2489 obj = t->input_arg[i];
2490
2491 // Macroblock is used.
2492 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493 {
2494 dim_t jj, kk;
2495 dim_t m = FLA_Obj_length( obj );
2496 dim_t n = FLA_Obj_width( obj );
2497 dim_t cs = FLA_Obj_col_stride( obj );
2498 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2499
2500 // Clear each block in macroblock.
2501 for ( jj = 0; jj < n; jj++ )
2502 {
2503 for ( kk = 0; kk < m; kk++ )
2504 {
2505 obj = *( buf + jj * cs + kk );
2506
2507 thread = obj.base->n_read_blocks % n_threads;
2508
2509 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510
2511 k = obj.base->n_read_tasks;
2512 d = obj.base->read_task_head;
2513
2514 obj.base->n_read_tasks = 0;
2515 obj.base->read_task_head = NULL;
2516 obj.base->read_task_tail = NULL;
2517
2518 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519
2520 for ( j = 0; j < k; j++ )
2521 {
2522 next_dep = d->next_dep;
2523 FLA_free( d );
2524 d = next_dep;
2525 }
2526 }
2527 }
2528 }
2529 else // Regular block.
2530 {
2531 thread = obj.base->n_read_blocks % n_threads;
2532
2533 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534
2535 k = obj.base->n_read_tasks;
2536 d = obj.base->read_task_head;
2537
2538 obj.base->n_read_tasks = 0;
2539 obj.base->read_task_head = NULL;
2540 obj.base->read_task_tail = NULL;
2541
2542 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543
2544 for ( j = 0; j < k; j++ )
2545 {
2546 next_dep = d->next_dep;
2547 FLA_free( d );
2548 d = next_dep;
2549 }
2550 }
2551 }
2552
2553 // Free the dep_arg field of t.
2554 d = t->dep_arg_head;
2555
2556 for ( i = 0; i < t->n_dep_args; i++ )
2557 {
2558 next_dep = d->next_dep;
2559 FLA_free( d );
2560 d = next_dep;
2561 }
2562
2563 // Free the int_arg field of t.
2564 FLA_free( t->int_arg );
2565
2566 // Free the fla_arg field of t.
2567 FLA_free( t->fla_arg );
2568
2569 // Free the input_arg field of t.
2570 FLA_free( t->input_arg );
2571
2572 // Free the output_arg field of t.
2573 FLA_free( t->output_arg );
2574
2575 // Finally, free the struct itself.
2576 FLA_free( t );
2577
2578 return;
2579}
FLA_Lock * war_lock
Definition FLASH_Queue_exec.c:70

References FLA_Obj_view::base, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), i, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Task_update_binding()

FLASH_Task * FLASH_Task_update_binding ( FLASH_Task * t,
FLASH_Task * r,
void * arg 
)
2401{
2403 int queue;
2404
2405 if ( r == NULL )
2406 {
2407 // There are no tasks on waiting queue, so bind the first task.
2408 r = t;
2409 r->hit = TRUE;
2410 }
2411 else
2412 {
2413 // Swap the binded task for the new ready task.
2414 if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
2415 {
2416 queue = r->queue;
2417 r->hit = FALSE;
2418
2419 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2420
2421 // Place swapped task back onto waiting queue.
2422 FLASH_Queue_wait_enqueue( r, arg );
2423
2424 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2425
2426 // Bind the new ready task.
2427 r = t;
2428 r->hit = TRUE;
2429 }
2430 else // Keep the binded task and enqueue new ready task.
2431 {
2432 queue = t->queue;
2433
2434 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2435
2436 FLASH_Queue_wait_enqueue( t, arg );
2437
2438 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2439 }
2440 }
2441
2442 return r;
2443}
FLA_Bool hit
Definition FLA_type_defs.h:194

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, i, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

◆ FLASH_Task_update_dependencies()

FLASH_Task * FLASH_Task_update_dependencies ( FLASH_Task * t,
void * arg 
)
3050{
3052 int i;
3054 int thread;
3056 FLASH_Task* task;
3057 FLASH_Task* r = NULL;
3058 FLASH_Dep* d = t->dep_arg_head;
3059
3060 // Check each dependent task.
3061 for ( i = 0; i < t->n_dep_args; i++ )
3062 {
3063 task = d->task;
3064
3065 // Use the remaining locks except for the first one.
3066 thread = ( n_threads > 1 ? task->order % ( n_threads - 1 ) + 1 : 0 );
3067
3068 RCCE_acquire_lock( thread );
3069
3070 args->n_ready[task->order]--;
3071 available = ( args->n_ready[task->order] == 0 );
3072
3073 RCCE_release_lock( thread );
3074
3075 // Place newly ready tasks on waiting queue.
3076 if ( available )
3077 {
3078 RCCE_acquire_lock( 0 );
3079
3081
3082 RCCE_release_lock( 0 );
3083 }
3084
3085 // Go to the next dep.
3086 d = d->next_dep;
3087 }
3088
3089 return r;
3090}

References FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), i, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().