libflame revision_anchor
Data Structures | Typedefs | Functions
FLASH_Queue_exec.c File Reference

(r)

Data Structures

struct  FLA_Obj_gpu_struct
 
struct  FLASH_Queue_variables
 

Typedefs

typedef struct FLA_Obj_gpu_struct FLA_Obj_gpu
 
typedef struct FLASH_Queue_variables FLASH_Queue_vars
 

Functions

void FLASH_Queue_exec (void)
 
void FLASH_Queue_init_tasks (void *arg)
 
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
 
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
 
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
 
void FLASH_Queue_prefetch (int cache, void *arg)
 
void FLASH_Queue_prefetch_block (FLA_Obj obj)
 
FLASH_Task * FLASH_Queue_work_stealing (int queue, void *arg)
 
void FLASH_Queue_create_gpu (int thread, void *arg)
 
void FLASH_Queue_destroy_gpu (int thread, void *arg)
 
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
 
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
 
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
 
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_gpu (int thread, void *arg)
 
void FLASH_Queue_exec_parallel (void *arg)
 
void * FLASH_Queue_exec_parallel_function (void *arg)
 
FLASH_Task * FLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
 
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
 
void FLASH_Queue_exec_simulation (void *arg)
 
int RCCE_acquire_lock (int)
 
int RCCE_release_lock (int)
 
double RCCE_wtime (void)
 
int RCCE_ue (void)
 
void Synch_all ()
 

Typedef Documentation

◆ FLA_Obj_gpu

◆ FLASH_Queue_vars

Function Documentation

◆ FLASH_Queue_check_block_gpu()

FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg 
)
1552{
1554 int k;
1557
1558#ifdef FLA_ENABLE_MULTITHREADING
1559 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560#endif
1561
1562 // Locate the position of the block on the GPU.
1563 for ( k = 0; k < gpu_n_blocks; k++ )
1564 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565 break;
1566
1567 if ( k < gpu_n_blocks )
1568 {
1569 // Request this block if it is dirty.
1570 if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571 {
1572 args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573
1574 r_val = FALSE;
1575 }
1576 }
1577
1578 // Check the victim block.
1579 if ( obj.base == args->victim[thread].obj.base )
1580 r_val = FALSE;
1581
1582#ifdef FLA_ENABLE_MULTITHREADING
1583 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584#endif
1585
1586 return r_val;
1587}
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition FLASH_Queue_gpu.c:119
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:58
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:43
unsigned long dim_t
Definition FLA_type_defs.h:71
int FLA_Bool
Definition FLA_type_defs.h:46
int i
Definition bl1_axmyv2.c:145
Definition FLASH_Queue_exec.c:55
FLA_Obj_gpu * victim
Definition FLASH_Queue_exec.c:107
FLA_Lock * gpu_lock
Definition FLASH_Queue_exec.c:101
FLA_Obj_gpu * gpu
Definition FLASH_Queue_exec.c:104
FLA_Obj obj
Definition FLASH_Queue_exec.c:40
FLA_Bool request
Definition FLASH_Queue_exec.c:49
FLA_Bool clean
Definition FLASH_Queue_exec.c:46
FLA_Base_obj * base
Definition FLA_type_defs.h:168

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

◆ FLASH_Queue_check_gpu()

FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task * t,
void * arg 
)
1453{
1454 int i, j, k;
1455 int thread = t->thread;
1456 int n_input_args = t->n_input_args;
1457 int n_output_args = t->n_output_args;
1462 FLA_Obj obj;
1463
1464 // Check the input and output arguments on the GPUs.
1465 for ( i = 0; i < n_input_args + n_output_args; i++ )
1466 {
1467 // Check for duplicate blocks.
1468 duplicate = FALSE;
1469
1470 // Find the correct input or output argument.
1471 if ( i < n_input_args )
1472 {
1473 obj = t->input_arg[i];
1474
1475 for ( j = 0; j < n_output_args && !duplicate; j++ )
1476 {
1477 if ( obj.base == t->output_arg[j].base )
1478 duplicate = TRUE;
1479 }
1480
1481 for ( j = 0; j < i && !duplicate; j++ )
1482 {
1483 if ( obj.base == t->input_arg[j].base )
1484 duplicate = TRUE;
1485 }
1486 }
1487 else
1488 {
1489 obj = t->output_arg[i - n_input_args];
1490
1491 for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1492 {
1493 if ( obj.base == t->output_arg[j].base )
1494 duplicate = TRUE;
1495 }
1496 }
1497
1498 // If the block has not been processed before.
1499 if ( !duplicate )
1500 {
1501 // Macroblock is used.
1502 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1503 {
1504 dim_t jj, kk;
1505 dim_t m = FLA_Obj_length( obj );
1506 dim_t n = FLA_Obj_width( obj );
1507 dim_t cs = FLA_Obj_col_stride( obj );
1508 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1509
1510 // Clear each block in macroblock.
1511 for ( jj = 0; jj < n; jj++ )
1512 {
1513 for ( kk = 0; kk < m; kk++ )
1514 {
1515 obj = *( buf + jj * cs + kk );
1516
1517 t_val = TRUE;
1518
1519 // Check to see if the block is dirty on another GPU.
1520 for ( k = 0; k < n_threads && t_val; k++ )
1521 if ( k != thread )
1523
1524 r_val = r_val && t_val;
1525 }
1526 }
1527 }
1528 else
1529 {
1530 t_val = TRUE;
1531
1532 // Check to see if the block is dirty on another GPU.
1533 for ( k = 0; k < n_threads && t_val; k++ )
1534 if ( k != thread )
1536
1537 r_val = r_val && t_val;
1538 }
1539 }
1540 }
1541
1542 return r_val;
1543}
FLA_Bool FLASH_Queue_check_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1546
unsigned int FLASH_Queue_get_num_threads(void)
Definition FLASH_Queue.c:223
dim_t FLA_Obj_width(FLA_Obj obj)
Definition FLA_Query.c:123
dim_t FLA_Obj_length(FLA_Obj obj)
Definition FLA_Query.c:116
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition FLA_Query.c:51
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition FLA_Query.c:174
Definition FLA_type_defs.h:159

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), and i.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_create_gpu()

void FLASH_Queue_create_gpu ( int  thread,
void * arg 
)
1233{
1235 int i;
1237 dim_t block_size = args->block_size;
1238 FLA_Datatype datatype = args->datatype;
1239
1240 // Exit if not using GPU.
1242 return;
1243
1244 // Bind thread to GPU.
1245 FLASH_Queue_bind_gpu( thread );
1246
1247 // Allocate the memory on the GPU for all the blocks a priori.
1248 for ( i = 0; i < gpu_n_blocks; i++ )
1249 FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250
1251 return;
1252}
FLA_Error FLASH_Queue_bind_gpu(int thread)
Definition FLASH_Queue_gpu.c:133
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition FLASH_Queue_gpu.c:91
FLA_Error FLASH_Queue_alloc_gpu(dim_t size, FLA_Datatype datatype, void **buffer_gpu)
Definition FLASH_Queue_gpu.c:147
int FLA_Datatype
Definition FLA_type_defs.h:49
dim_t block_size
Definition FLASH_Queue_exec.c:113
FLA_Datatype datatype
Definition FLASH_Queue_exec.c:116
void * buffer_gpu
Definition FLASH_Queue_exec.c:43

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_destroy_gpu()

void FLASH_Queue_destroy_gpu ( int  thread,
void * arg 
)
1261{
1263 int i;
1266
1267 // Exit if not using GPU.
1269 return;
1270
1271 // Examine every block left on the GPU.
1272 for ( i = 0; i < gpu_n_blocks; i++ )
1273 {
1274 gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275
1276 // Flush the blocks that are dirty.
1277 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279
1280 // Free the memory on the GPU for all the blocks.
1281 FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282 }
1283
1284 return;
1285}
FLA_Error FLASH_Queue_free_gpu(void *buffer_gpu)
Definition FLASH_Queue_gpu.c:171
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition FLASH_Queue_gpu.c:205
Definition FLASH_Queue_exec.c:38

References FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_exec()

void FLASH_Queue_exec ( void  )
127{
128 int n_tasks = FLASH_Queue_get_num_tasks();
130 int n_queues;
131 int n_caches;
132 int size;
133 int i;
134 dim_t block_size = FLASH_Queue_get_block_size();
135 double dtime;
136
137 FLA_Lock* run_lock;
138 FLA_Lock* dep_lock;
139 FLA_Lock* war_lock;
140 FLA_Lock* cac_lock;
141
142 FLA_Obj* cache;
143 FLA_Obj* prefetch;
144 FLASH_Queue* wait_queue;
145
146#ifdef FLA_ENABLE_GPU
147#ifdef FLA_ENABLE_MULTITHREADING
148 FLA_Lock* gpu_lock;
149#endif
150 FLA_Obj_gpu* gpu;
151 FLA_Obj_gpu* victim;
152 FLA_Obj_gpu* gpu_log;
154#endif
155
156 // All the necessary variables for the SuperMatrix mechanism.
157 FLASH_Queue_vars args;
158
159 // If the queue is empty, return early.
160 if ( n_tasks == 0 )
161 return;
162
163#ifndef FLA_ENABLE_MULTITHREADING
164 // Turn off work stealing in simulation mode.
166#endif
167
168 // Query the number of user set threads per queue.
170
171 // Default user setting for number of threads.
172 if ( n_queues <= 0 )
173 {
174 // Do not use data affinity or work stealing when caching is enabled.
176 {
179 }
180
181 // Do not use work stealing when data affinity is enabled.
183 {
185 }
186
187 // Allocate different arrays if using data affinity.
188 n_queues = ( FLASH_Queue_get_data_affinity() ==
191 }
192 else
193 {
194 // Set the number of queues.
195 n_queues = n_threads / n_queues;
196
197 // Must use at least one queue.
198 if ( n_queues == 0 )
199 n_queues = 1;
200
201 if ( n_queues == 1 )
202 {
203 // Turn off all multiple queue implementations.
206 }
207 else
208 {
209 // Use 2D data affinity for multiple queues if nothing is set.
212 {
214 }
215 }
216 }
217
218 // Determine the number of caches.
220
221 args.n_queues = n_queues;
222 args.n_caches = n_caches;
223
224#ifdef FLA_ENABLE_MULTITHREADING
225 // Allocate memory for array of locks.
226 run_lock = ( FLA_Lock* ) FLA_malloc( n_queues * sizeof( FLA_Lock ) );
227 dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
228 war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
229 cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches * sizeof( FLA_Lock ) );
230
231 args.run_lock = run_lock;
232 args.dep_lock = dep_lock;
233 args.war_lock = war_lock;
234 args.cac_lock = cac_lock;
235
236 // Initialize the all lock.
237 FLA_Lock_init( &(args.all_lock) );
238
239 // Initialize the run lock for thread i.
240 for ( i = 0; i < n_queues; i++ )
241 {
242 FLA_Lock_init( &(args.run_lock[i]) );
243 }
244
245 // Initialize the dep and war locks for thread i.
246 for ( i = 0; i < n_threads; i++ )
247 {
248 FLA_Lock_init( &(args.dep_lock[i]) );
249 FLA_Lock_init( &(args.war_lock[i]) );
250 }
251
252 // Initialize the cac locks for each cache.
253 for ( i = 0; i < n_caches; i++ )
254 {
255 FLA_Lock_init( &(args.cac_lock[i]) );
256 }
257#endif
258
259 // The number of blocks that can fit into the cache on each thread.
260 if ( block_size == 0 )
261 size = MIN_CACHE_BLOCKS;
262 else
263 size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS);
264 args.size = size;
265
266 // Allocate memory for cache, prefetch buffer, and waiting queue.
267 cache = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
268 prefetch = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
269 wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ));
270
271 args.cache = cache;
272 args.prefetch = prefetch;
273 args.wait_queue = wait_queue;
274
275 // Initialize cache, prefetch buffer, and waiting queue.
276 for ( i = 0; i < size * n_caches; i++ )
277 args.cache[i].base = NULL;
278
279 for ( i = 0; i < size; i++ )
280 args.prefetch[i].base = NULL;
281
282 for ( i = 0; i < n_queues; i++ )
283 {
284 args.wait_queue[i].n_tasks = 0;
285 args.wait_queue[i].head = NULL;
286 args.wait_queue[i].tail = NULL;
287 }
288
289 // Initialize the aggregate task counter.
290 args.pc = 0;
291
292#ifdef FLA_ENABLE_GPU
293#ifdef FLA_ENABLE_MULTITHREADING
294 // Allocate and initialize the gpu locks.
295 gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
296 args.gpu_lock = gpu_lock;
297
298 for ( i = 0; i < n_threads; i++ )
299 FLA_Lock_init( &(args.gpu_lock[i]) );
300#endif
301 // Allocate and initialize GPU software cache.
302 gpu = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
303 args.gpu = gpu;
304
305 for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
306 {
307 args.gpu[i].obj.base = NULL;
308 args.gpu[i].buffer_gpu = NULL;
309 args.gpu[i].clean = TRUE;
310 args.gpu[i].request = FALSE;
311 }
312
313 victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
314 args.victim = victim;
315
316 for ( i = 0; i < n_threads; i++ )
317 args.victim[i].obj.base = NULL;
318
319 gpu_log = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
320 args.gpu_log = gpu_log;
321#endif
322
323 // Initialize tasks with critical information.
324 FLASH_Queue_init_tasks( ( void* ) &args );
325
326 // Display verbose output before free all tasks.
329
330 // Start timing the parallel execution.
331 dtime = FLA_Clock();
332
333#ifdef FLA_ENABLE_MULTITHREADING
334 // Parallel Execution!
335 FLASH_Queue_exec_parallel( ( void* ) &args );
336#else
337 // Simulation!
338 FLASH_Queue_exec_simulation( ( void* ) &args );
339#endif
340
341 // End timing the parallel execution.
342 dtime = FLA_Clock() - dtime;
344
345#ifdef FLA_ENABLE_MULTITHREADING
346 // Destroy the locks.
347 FLA_Lock_destroy( &(args.all_lock) );
348
349 for ( i = 0; i < n_queues; i++ )
350 {
351 FLA_Lock_destroy( &(args.run_lock[i]) );
352 }
353
354 for ( i = 0; i < n_threads; i++ )
355 {
356 FLA_Lock_destroy( &(args.dep_lock[i]) );
357 FLA_Lock_destroy( &(args.war_lock[i]) );
358 }
359
360 for ( i = 0; i < n_caches; i++ )
361 {
362 FLA_Lock_destroy( &(args.cac_lock[i]) );
363 }
364
365 // Deallocate memory.
366 FLA_free( run_lock );
367 FLA_free( dep_lock );
368 FLA_free( war_lock );
369 FLA_free( cac_lock );
370#endif
371
372 FLA_free( cache );
373 FLA_free( prefetch );
374 FLA_free( wait_queue );
375
376#ifdef FLA_ENABLE_GPU
377#ifdef FLA_ENABLE_MULTITHREADING
378 for ( i = 0; i < n_threads; i++ )
379 FLA_Lock_destroy( &(args.gpu_lock[i]) );
380 FLA_free( gpu_lock );
381#endif
382 FLA_free( gpu );
383 FLA_free( victim );
384 FLA_free( gpu_log );
385#endif
386
387 // Reset values for next call to FLASH_Queue_exec().
389
390 return;
391}
void FLASH_Queue_exec_parallel(void *arg)
Definition FLASH_Queue_exec.c:2043
void FLASH_Queue_exec_simulation(void *arg)
Definition FLASH_Queue_exec.c:2589
void FLASH_Queue_init_tasks(void *arg)
Definition FLASH_Queue_exec.c:394
void FLASH_Queue_set_data_affinity(FLASH_Data_aff data_affinity)
Definition FLASH_Queue.c:391
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition FLASH_Queue.c:308
FLA_Bool FLASH_Queue_get_work_stealing(void)
Definition FLASH_Queue.c:380
FLA_Bool FLASH_Queue_get_caching(void)
Definition FLASH_Queue.c:356
void FLASH_Queue_set_parallel_time(double dtime)
Definition FLASH_Queue.c:448
void FLASH_Queue_verbose_output(void)
Definition FLASH_Queue.c:1782
dim_t FLASH_Queue_get_cache_size(void)
Definition FLASH_Queue.c:500
void FLASH_Queue_set_work_stealing(FLA_Bool work_stealing)
Definition FLASH_Queue.c:367
int FLASH_Queue_get_cores_per_cache(void)
Definition FLASH_Queue.c:548
FLASH_Data_aff FLASH_Queue_get_data_affinity(void)
Definition FLASH_Queue.c:404
int FLASH_Queue_get_cores_per_queue(void)
Definition FLASH_Queue.c:572
void FLASH_Queue_reset(void)
Definition FLASH_Queue.c:583
unsigned int FLASH_Queue_get_num_tasks(void)
Definition FLASH_Queue.c:284
dim_t FLASH_Queue_get_block_size(void)
Definition FLASH_Queue.c:476
void FLA_free(void *ptr)
Definition FLA_Memory.c:247
void * FLA_malloc(size_t size)
Definition FLA_Memory.c:111
void FLA_Lock_init(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:28
void FLA_Lock_destroy(FLA_Lock *fla_lock_ptr)
Definition FLA_Lock.c:73
double FLA_Clock(void)
Definition FLA_Clock.c:20
Definition FLA_type_defs.h:174
FLASH_Task * tail
Definition FLA_type_defs.h:180
FLASH_Task * head
Definition FLA_type_defs.h:179
unsigned int n_tasks
Definition FLA_type_defs.h:176
FLA_Lock * war_lock
Definition FLASH_Queue_exec.c:70
int pc
Definition FLASH_Queue_exec.c:96
int n_queues
Definition FLASH_Queue_exec.c:77
FLA_Obj_gpu * gpu_log
Definition FLASH_Queue_exec.c:110
FLA_Lock all_lock
Definition FLASH_Queue_exec.c:58
FLA_Obj * cache
Definition FLASH_Queue_exec.c:86
FLA_Lock * cac_lock
Definition FLASH_Queue_exec.c:74
FLASH_Queue * wait_queue
Definition FLASH_Queue_exec.c:92
int n_caches
Definition FLASH_Queue_exec.c:80
FLA_Lock * dep_lock
Definition FLASH_Queue_exec.c:66
FLA_Obj * prefetch
Definition FLASH_Queue_exec.c:89
int size
Definition FLASH_Queue_exec.c:83
FLA_Lock * run_lock
Definition FLASH_Queue_exec.c:62
Definition FLA_type_defs.h:102

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, i, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

Referenced by FLASH_Queue_end().

◆ FLASH_Queue_exec_gpu()

FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task * t,
void * arg 
)
1294{
1295 void** input_arg;
1296 void** output_arg;
1297
1298 if ( t == NULL )
1299 return TRUE;
1300
1301 // If not using the GPU, then execute on CPU.
1303 {
1305
1306 return TRUE;
1307 }
1308
1309 // Check if all the operands are ready and up to date.
1310 if ( !FLASH_Queue_check_gpu( t, arg ) )
1311 {
1313 int queue = t->queue;
1314 t->hit = FALSE;
1315
1316#ifdef FLA_ENABLE_MULTITHREADING
1317 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
1318#endif
1319 // Reenqueue the task if the blocks are not all flushed.
1321
1322#ifdef FLA_ENABLE_MULTITHREADING
1323 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
1324#endif
1325
1326 return FALSE;
1327 }
1328
1329 // If GPU is enabled, but the task is not supported for GPU execution.
1330 if ( !t->enabled_gpu )
1331 {
1332 int i, j, k;
1333 int thread = t->thread;
1334 int n_input_args = t->n_input_args;
1335 int n_output_args = t->n_output_args;
1338 FLA_Obj obj;
1339
1340 // Check the blocks on each GPU.
1341 for ( k = 0; k < n_threads; k++ )
1342 {
1343 // Check the input and output arguments on the GPUs.
1344 for ( i = 0; i < n_input_args + n_output_args; i++ )
1345 {
1346 // Check for duplicate blocks.
1347 duplicate = FALSE;
1348
1349 // Find the correct input or output argument.
1350 if ( i < n_input_args )
1351 {
1352 obj = t->input_arg[i];
1353
1354 for ( j = 0; j < n_output_args && !duplicate; j++ )
1355 {
1356 if ( obj.base == t->output_arg[j].base )
1357 duplicate = TRUE;
1358 }
1359
1360 for ( j = 0; j < i && !duplicate; j++ )
1361 {
1362 if ( obj.base == t->input_arg[j].base )
1363 duplicate = TRUE;
1364 }
1365 }
1366 else
1367 {
1368 obj = t->output_arg[i - n_input_args];
1369
1370 for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1371 {
1372 if ( obj.base == t->output_arg[j].base )
1373 duplicate = TRUE;
1374 }
1375 }
1376
1377 // If the block has not been processed before.
1378 if ( !duplicate )
1379 {
1380 // Macroblock is used.
1381 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1382 {
1383 dim_t jj, kk;
1384 dim_t m = FLA_Obj_length( obj );
1385 dim_t n = FLA_Obj_width( obj );
1386 dim_t cs = FLA_Obj_col_stride( obj );
1387 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1388
1389 // Clear each block in macroblock.
1390 for ( jj = 0; jj < n; jj++ )
1391 {
1392 for ( kk = 0; kk < m; kk++ )
1393 {
1394 obj = *( buf + jj * cs + kk );
1395
1396 // Flush the block to main memory if it is on the GPU.
1397 if ( k == thread )
1399
1400 // Invalidate output block on all GPUs.
1401 if ( i >= n_input_args )
1403 }
1404 }
1405 }
1406 else
1407 {
1408 // Flush the block to main memory if it is on the GPU.
1409 if ( k == thread )
1411
1412 // Invalidate output block on all GPUs.
1413 if ( i >= n_input_args )
1415 }
1416 }
1417 }
1418 }
1419
1420 // Execute the task on CPU instead of GPU.
1422
1423 return TRUE;
1424 }
1425
1426 // Gather the pointers for the data on the GPU.
1427 input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
1428 output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );
1429
1430 // Bring all the blocks to GPU.
1431 FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );
1432
1433 // Execute the task on GPU.
1434 FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );
1435
1436 // Mark all the output blocks as dirty.
1438
1439 // Free memory.
1440 FLA_free( input_arg );
1441 FLA_free( output_arg );
1442
1443 return TRUE;
1444}
void FLASH_Queue_flush_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1893
void FLASH_Queue_update_gpu(FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
Definition FLASH_Queue_exec.c:1590
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition FLASH_Queue_exec.c:1844
FLA_Bool FLASH_Queue_check_gpu(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:1447
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:626
void FLASH_Queue_mark_gpu(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:1787
void FLASH_Queue_exec_task_gpu(FLASH_Task *t, void **input_arg, void **output_arg)
Definition FLASH_Queue_gpu.c:225
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition FLASH_Queue.c:1141

References FLA_Obj_view::base, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), i, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_exec_parallel()

void FLASH_Queue_exec_parallel ( void * arg)
2049{
2050 int i;
2052 void* (*thread_entry_point)( void* );
2053
2054 // Allocate the thread structures array. Here, an array of FLASH_Thread
2055 // structures of length n_threads is allocated and the fields of each
2056 // structure set to appropriate values.
2057 FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );
2058
2059 // Initialize the thread structures array.
2060 for ( i = 0; i < n_threads; i++ )
2061 {
2062 // Save the thread's identifier.
2063 thread[i].id = i;
2064
2065 // Save the pointer to the necessary variables with the thread.
2066 thread[i].args = arg;
2067
2068 // The pthread object, if it was even compiled into the FLASH_Thread
2069 // structure, will be initialized by the pthread implementation when we
2070 // call pthread_create() and does not need to be touched at this time.
2071 }
2072
2073 // Determine which function to send threads to.
2075
2076#if FLA_MULTITHREADING_MODEL == FLA_OPENMP
2077
2078 // An OpenMP parallel for region spawns n_threads threads. Each thread
2079 // executes the work function with a different FLASH_Thread argument.
2080 // An implicit synchronization point exists at the end of the curly
2081 // brace scope.
2082 #pragma omp parallel for \
2083 private( i ) \
2084 shared( thread, n_threads, thread_entry_point ) \
2085 schedule( static, 1 ) \
2086 num_threads( n_threads )
2087 for ( i = 0; i < n_threads; ++i )
2088 {
2089 thread_entry_point( ( void* ) &thread[i] );
2090 }
2091
2092#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2093
2094 // Create each POSIX thread needed in addition to the main thread.
2095 for ( i = 1; i < n_threads; i++ )
2096 {
2097 int pthread_e_val;
2098
2099 // Create thread i with default attributes.
2100 pthread_e_val = pthread_create( &(thread[i].pthread_obj),
2101 NULL,
2103 ( void* ) &thread[i] );
2104
2105#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2108#endif
2109 }
2110
2111 // The main thread is assigned the role of thread 0. Here we manually
2112 // execute it as a worker thread.
2113 thread_entry_point( ( void* ) &thread[0] );
2114
2115 // Wait for non-main threads to finish.
2116 for ( i = 1; i < n_threads; i++ )
2117 {
2118 // These two variables are declared local to this for loop since this
2119 // is the only place they are needed, and since they would show up as
2120 // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
2121 // Strangely, the Intel compiler produces code that results in an
2122 // "unaligned access" runtime message if thread_status is declared as
2123 // an int. Declaring it as a long or void* appears to force the
2124 // compiler (not surprisingly) into aligning it to an 8-byte boundary.
2125 int pthread_e_val;
2126 void* thread_status;
2127
2128 // Wait for thread i to invoke its respective pthread_exit().
2129 // The return value passed to pthread_exit() is provided to us
2130 // via status, if one was given.
2131 pthread_e_val = pthread_join( thread[i].pthread_obj,
2132 ( void** ) &thread_status );
2133
2134#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2137#endif
2138 }
2139
2140#endif
2141
2142 FLA_free( thread );
2143
2144 return;
2145}
void * FLASH_Queue_exec_parallel_function(void *arg)
Definition FLASH_Queue_exec.c:2156
FLA_Error FLA_Check_pthread_create_result(int pthread_create_r_val)
Definition FLA_Check.c:750
FLA_Error FLA_Check_pthread_join_result(int pthread_join_r_val)
Definition FLA_Check.c:760
int FLA_Error
Definition FLA_type_defs.h:47
Definition FLA_type_defs.h:255
int id
Definition FLA_type_defs.h:257

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_get_num_threads(), i, and FLASH_Thread_s::id.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_exec_parallel_function()

void * FLASH_Queue_exec_parallel_function ( void * arg)
2162{
2163 FLASH_Queue_vars* args;
2164 int i;
2165 int queue;
2166 int cache;
2167 int n_tasks = FLASH_Queue_get_num_tasks();
2176 FLASH_Task* t = NULL;
2177 FLASH_Task* r = NULL;
2179 //cpu_set_t cpu_set;
2180
2181 // Interpret the thread argument as what it really is--a pointer to an
2182 // FLASH_Thread structure.
2183 me = ( FLASH_Thread* ) arg;
2184
2185 // Extract the variables from the current thread.
2186 args = ( FLASH_Queue_vars* ) me->args;
2187
2188 // Figure out the id of the current thread.
2189 i = me->id;
2190
2191 // Set the CPU affinity; We want the current thread i to run only on CPU i.
2192 //CPU_ZERO( &cpu_set );
2193 //CPU_SET( i, &cpu_set );
2194 //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );
2195
2196 // Determine to which queue this thread belongs.
2197 queue = i / ( n_threads / args->n_queues );
2198
2199 // Determine to which cache this thread belongs.
2200 cache = i / n_cores;
2201
2202#ifdef FLA_ENABLE_GPU
2203 // Create memory on GPU.
2204 FLASH_Queue_create_gpu( i, ( void* ) args );
2205
2206 // Save whether GPUs are enabled.
2208
2209 // Only use each GPU as its own cache when GPUs are enabled.
2210 if ( enabled )
2211 cache = i;
2212#endif
2213
2214 // Prefetch blocks into the cache before execution.
2215 if ( caching && !enabled && i % n_cores == 0 )
2216 FLASH_Queue_prefetch( cache, ( void* ) args );
2217
2218 // Loop until all the tasks have committed.
2219 while ( condition )
2220 {
2221#ifdef FLA_ENABLE_GPU
2222 // Check to see if any blocks on GPU need to be flushed.
2223 FLASH_Queue_flush_gpu( i, ( void* ) args );
2224#endif
2225
2226 // Dequeue a task if there has not been one binded to thread.
2227 if ( r == NULL )
2228 {
2229 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2230
2231 // Obtain task to execute.
2232 t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
2233
2234 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2235 }
2236 else
2237 {
2238 // Obtain the binded task.
2239 t = r;
2240 r = NULL;
2241 }
2242
2243 // Dequeued a task from the waiting queue.
2244 available = ( t != NULL );
2245
2246 if ( available )
2247 {
2248 // Save the thread and cache that executes the task.
2249 t->thread = i;
2250 t->cache = cache;
2251
2252 if ( caching && !enabled )
2253 {
2254 // Update the current state of the cache.
2255 FLASH_Queue_update_cache( t, ( void* ) args );
2256 }
2257
2258#ifdef FLA_ENABLE_GPU
2259 // Execute the task on GPU.
2260 committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
2261#else
2262 // Execute the task.
2264#endif
2265
2266 // If the task has executed or not.
2267 if ( committed )
2268 {
2269 // Update task dependencies.
2270 r = FLASH_Task_update_dependencies( t, ( void* ) args );
2271
2272 // Free the task once it executes in parallel.
2273 FLASH_Task_free_parallel( t, ( void* ) args );
2274 }
2275 }
2276 else
2277 {
2278 if ( stealing )
2279 {
2280 // Perform work stealing if there are no tasks to dequeue.
2281 r = FLASH_Queue_work_stealing( queue, ( void* ) args );
2282 }
2283 }
2284
2285 FLA_Lock_acquire( &(args->all_lock) ); // A ***
2286
2287 // Increment program counter.
2288 if ( available && committed )
2289 args->pc++;
2290
2291 // Terminate loop.
2292 if ( args->pc >= n_tasks )
2293 condition = FALSE;
2294
2295 FLA_Lock_release( &(args->all_lock) ); // A ***
2296 }
2297
2298#ifdef FLA_ENABLE_GPU
2299 // Destroy and flush contents of GPU back to main memory.
2300 FLASH_Queue_destroy_gpu( i, ( void* ) args );
2301#endif
2302
2303#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2304 // If this is a non-main thread, then exit with a zero (normal) error code.
2305 // The main thread cannot call pthread_exit() because this routine never
2306 // returns. The main thread must proceed so it can oversee the joining of
2307 // the exited non-main pthreads.
2308 if ( i != 0 )
2309 pthread_exit( ( void* ) NULL );
2310#endif
2311
2312 return ( void* ) NULL;
2313}
void FLASH_Task_free_parallel(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:2446
void FLASH_Queue_create_gpu(int thread, void *arg)
Definition FLASH_Queue_exec.c:1227
FLA_Bool FLASH_Queue_exec_gpu(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:1288
void FLASH_Queue_prefetch(int cache, void *arg)
Definition FLASH_Queue_exec.c:1024
FLASH_Task * FLASH_Task_update_dependencies(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:2316
void FLASH_Queue_flush_gpu(int thread, void *arg)
Definition FLASH_Queue_exec.c:1961
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition FLASH_Queue_exec.c:678
void FLASH_Queue_destroy_gpu(int thread, void *arg)
Definition FLASH_Queue_exec.c:1255
void FLASH_Queue_update_cache(FLASH_Task *t, void *arg)
Definition FLASH_Queue_exec.c:847
FLASH_Task * FLASH_Queue_work_stealing(int queue, void *arg)
Definition FLASH_Queue_exec.c:1155
Definition FLA_type_defs.h:184

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), i, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

◆ FLASH_Queue_exec_simulation()

void FLASH_Queue_exec_simulation ( void * arg )
2595{
2597 int i, j;
2598 int queue;
2599 int cache;
2600 int n_stages = 0;
2601 int n_queues = args->n_queues;
2602 int n_tasks = FLASH_Queue_get_num_tasks();
2606 FLASH_Task* task;
2607 FLASH_Task* t;
2608 FLASH_Dep* d;
2609
2610 // An array to hold the tasks to be executed during the simulation.
2611#ifdef FLA_ENABLE_WINDOWS_BUILD
2613#else
2615#endif
2616
2617 for ( i = 0; i < n_threads; i++ )
2618 {
2619 // Initialize all exec_array to NULL.
2620 exec_array[i] = NULL;
2621
2622 // Prefetch blocks into the cache before execution.
2623 if ( i % n_cores == 0 )
2625 }
2626
2627 // Loop until all the tasks have committed.
2628 while ( args->pc < n_tasks )
2629 {
2630 for ( i = 0; i < n_threads; i++ )
2631 {
2632 // Update waiting queue with ready tasks.
2633 t = exec_array[i];
2634
2635 if ( t != NULL )
2636 {
2637 // Check each dependent task.
2638 d = t->dep_arg_head;
2639
2640 for ( j = 0; j < t->n_dep_args; j++ )
2641 {
2642 task = d->task;
2643 task->n_ready--;
2644
2645 // Place newly ready tasks on waiting queue.
2646 if ( task->n_ready == 0 )
2647 {
2649 }
2650
2651 // Go to the next dep.
2652 d = d->next_dep;
2653 }
2654
2655 // Free the task.
2656 FLASH_Task_free( t );
2657 }
2658 }
2659
2660 n_stages++;
2661 if ( !verbose )
2662 printf( "%7d", n_stages );
2663
2664 // Move ready tasks from the waiting queue to execution queue.
2665 for ( i = 0; i < n_threads; i++ )
2666 {
2667 // Determine to which queue this thread belongs.
2668 queue = i / ( n_threads / n_queues );
2669
2670 // Determine to which cache this thread belongs.
2671 cache = i / n_cores;
2672
2673 // Dequeue a task.
2674 t = FLASH_Queue_wait_dequeue( queue, cache, arg );
2675
2676 // Save the task for execution.
2677 exec_array[i] = t;
2678
2679 if ( t != NULL )
2680 {
2681 // Save the thread and cache that executes the task.
2682 t->thread = i;
2683 t->cache = cache;
2684
2685 // Increment program counter.
2686 args->pc++;
2687 }
2688 }
2689
2690 // Execute independent tasks.
2691 for ( i = 0; i < n_threads; i++ )
2692 {
2693 t = exec_array[i];
2696
2697 if ( !verbose )
2698 printf( "%7s", ( t == NULL ? " " : t->name ) );
2699
2700 // Free the task if this is the last stage.
2701 if ( args->pc == n_tasks && t != NULL )
2702 FLASH_Task_free( t );
2703 }
2704
2705 if ( !verbose )
2706 printf( "\n" );
2707 }
2708
2709 if ( !verbose )
2710 printf( "\n" );
2711
2712#ifdef FLA_ENABLE_WINDOWS_BUILD
2714#endif
2715
2716 return;
2717}
void FLASH_Task_free(FLASH_Task *t)
Definition FLASH_Queue.c:1020
int FLASH_Verbose
Definition FLA_type_defs.h:113
Definition FLA_type_defs.h:245
FLASH_Task * task
Definition FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition FLA_type_defs.h:250
int n_ready
Definition FLA_type_defs.h:186

References FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), i, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_flush_block_gpu()

void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg 
)
1899{
1901 int k;
1905
1906#ifdef FLA_ENABLE_MULTITHREADING
1907 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1908#endif
1909
1910 // Locate the position of the block on the GPU.
1911 for ( k = 0; k < gpu_n_blocks; k++ )
1912 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1913 break;
1914
1915 // The block is owned by the GPU.
1916 if ( k < gpu_n_blocks )
1917 {
1918 // Save the block that will be flushed.
1919 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1920
1921 // If the block is dirty, then flush it.
1922 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1923 transfer = TRUE;
1924 }
1925
1926#ifdef FLA_ENABLE_MULTITHREADING
1927 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1928#endif
1929
1930 // Exit early if a flush is not required.
1931 if ( !transfer )
1932 return;
1933
1934 // Flush the block outside the critical section.
1935 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1936
1937#ifdef FLA_ENABLE_MULTITHREADING
1938 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1939#endif
1940
1941 // Locate the position of the block on the GPU.
1942 for ( k = 0; k < gpu_n_blocks; k++ )
1943 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1944 break;
1945
1946 if ( k < gpu_n_blocks )
1947 {
1948 // Update the bits for the flushed block.
1949 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1950 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1951 }
1952
1953#ifdef FLA_ENABLE_MULTITHREADING
1954 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1955#endif
1956
1957 return;
1958}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_flush_gpu()

void FLASH_Queue_flush_gpu ( int  thread,
void * arg 
)
1967{
1969 int i, k;
1971 int n_transfer = 0;
1973
1974 // Exit if not using GPU.
1976 return;
1977
1978#ifdef FLA_ENABLE_MULTITHREADING
1979 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1980#endif
1981
1982 for ( k = 0; k < gpu_n_blocks; k++ )
1983 {
1984 // Save the block that might be flushed.
1985 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1986
1987 // Flush the block if it is dirty and requested.
1988 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
1989 {
1990 // Save the block for data transfer outside the critical section.
1991 args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
1992 n_transfer++;
1993 }
1994 }
1995
1996#ifdef FLA_ENABLE_MULTITHREADING
1997 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1998#endif
1999
2000 // Exit early if a flush is not required.
2001 if ( n_transfer == 0 )
2002 return;
2003
2004 // Flush the block outside the critical section.
2005 for ( i = 0; i < n_transfer; i++ )
2006 {
2007 gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
2008 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
2009 }
2010
2011#ifdef FLA_ENABLE_MULTITHREADING
2012 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
2013#endif
2014
2015 // Update the bits for each block that is flushed.
2016 for ( i = 0; i < n_transfer; i++ )
2017 {
2018 // Locate the position of the block on the GPU.
2019 for ( k = 0; k < gpu_n_blocks; k++ )
2020 if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
2021 args->gpu[thread * gpu_n_blocks + k].obj.base )
2022 break;
2023
2024 if ( k < gpu_n_blocks )
2025 {
2026 // The block is now clean.
2027 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
2028 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
2029 }
2030 }
2031
2032#ifdef FLA_ENABLE_MULTITHREADING
2033 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
2034#endif
2035
2036 return;
2037}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Queue_init_tasks()

void FLASH_Queue_init_tasks ( void * arg )
400{
402 int i, j, k;
403 int n_tasks = FLASH_Queue_get_num_tasks();
404 int n_queues = args->n_queues;
405 int n_prefetch = 0;
406 int n_ready = 0;
407 int length = 0;
408 int width = 0;
409 int height = 0;
410 int size = args->size;
412 FLASH_Task* t;
413 FLASH_Dep* d;
414 FLA_Obj obj;
415
416#ifdef FLA_ENABLE_GPU
417 dim_t block_size = 0;
418 FLA_Datatype datatype = FLA_FLOAT;
420#endif
421
422 // Find the 2D factorization of the number of queues.
424 {
425 int sq_rt = 0;
426 while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
427 sq_rt--;
428 while ( n_queues % sq_rt != 0 ) sq_rt--;
429 length = n_queues / sq_rt;
430 width = sq_rt;
431 }
432
433 // Grab the tail of the task queue.
435
436 for ( i = n_tasks - 1; i >= 0; i-- )
437 {
438 // Determine data affinity.
440 { // No data affinity
441 t->queue = 0;
442 }
443 else
444 {
445 // Use the first output block to determine data affinity.
446 obj = t->output_arg[0];
447
448 // Use the top left block of the macroblock.
449 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
450 obj = *FLASH_OBJ_PTR_AT( obj );
451
453 { // Two-dimensional block cyclic
454 t->queue = ( obj.base->m_index % length ) +
455 ( obj.base->n_index % width ) * length;
456 }
458 { // One-dimensional row block cyclic
459 t->queue = obj.base->m_index % n_queues;
460 }
462 { // One-dimensional column block cyclic
463 t->queue = obj.base->n_index % n_queues;
464 }
465 else
466 { // Round-robin
467 t->queue = t->queue % n_queues;
468 }
469 }
470
471 // Determine the height of each task in the DAG.
472 height = 0;
473 d = t->dep_arg_head;
474
475 // Take the maximum height of dependent tasks.
476 for ( j = 0; j < t->n_dep_args; j++ )
477 {
478 height = max( height, d->task->height );
479 d = d->next_dep;
480 }
481
482 t->height = height + 1;
483
484 // Since freeing a task is always a leaf, we want to force it to execute
485 // earlier by giving it a greater height in order to reclaim memory.
486 if ( t->func == (void *) FLA_Obj_free_buffer_task )
487 t->height += n_tasks;
488
489#ifdef FLA_ENABLE_GPU
490 for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
491 {
492 // Find the correct input or output argument.
493 if ( j < t->n_output_args )
494 obj = t->output_arg[j];
495 else
496 obj = t->input_arg[j - t->n_output_args];
497
498 // Macroblock is used.
499 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
500 {
501 dim_t jj, kk;
502 dim_t m = FLA_Obj_length( obj );
503 dim_t n = FLA_Obj_width( obj );
504 dim_t cs = FLA_Obj_col_stride( obj );
505 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
506
507 // Check each block in macroblock.
508 for ( jj = 0; jj < n; jj++ )
509 {
510 for ( kk = 0; kk < m; kk++ )
511 {
512 obj = *( buf + jj * cs + kk );
513
514 block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );
515
516 if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
517 {
518 datatype = FLA_Obj_datatype( obj );
520 }
521 }
522 }
523 }
524 else // Regular block.
525 {
526 block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );
527
528 if ( FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
529 {
530 datatype = FLA_Obj_datatype( obj );
532 }
533 }
534 }
535#endif
536
537 // Find the first blocks accessed by each task.
538 if ( n_prefetch < size )
539 {
540 for ( j = 0; j < t->n_output_args; j++ )
541 {
542 obj = t->output_arg[j];
543
544 // Macroblock is used.
545 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
546 {
547 dim_t jj, kk;
548 dim_t m = FLA_Obj_length( obj );
549 dim_t n = FLA_Obj_width( obj );
550 dim_t cs = FLA_Obj_col_stride( obj );
551 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
552
553 // Check each block in macroblock.
554 for ( jj = 0; jj < n; jj++ )
555 {
556 for ( kk = 0; kk < m; kk++ )
557 {
558 obj = *( buf + jj * cs + kk );
559
560 k = obj.base->n_write_blocks;
561
562 // This block is one of the first blocks to be accessed.
563 if ( k < size && k == n_prefetch )
564 {
565 args->prefetch[k] = obj;
566 n_prefetch++;
567 }
568 }
569 }
570 }
571 else // Regular block.
572 {
573 k = obj.base->n_write_blocks;
574
575 // This block is one of the first blocks to be accessed.
576 if ( k < size && k == n_prefetch )
577 {
578 args->prefetch[k] = obj;
579 n_prefetch++;
580 }
581 }
582 }
583 }
584
585 // Find all ready tasks.
586 t->n_ready += t->n_input_args + t->n_output_args +
587 t->n_macro_args + t->n_war_args;
588
589 if ( t->n_ready == 0 )
590 {
591 // Save the number of ready and available tasks.
592 n_ready++;
593 }
594
595 // Go to the previous task.
596 t = t->prev_task;
597 }
598
599 // Grab the head of the task queue.
601
602 for ( i = 0; i < n_tasks && n_ready > 0; i++ )
603 {
604 if ( t->n_ready == 0 )
605 {
606 // Enqueue all the ready and available tasks.
608
609 // Decrement the number of ready tasks left to be enqueued.
610 n_ready--;
611 }
612
613 // Go to the next task.
614 t = t->next_task;
615 }
616
617#ifdef FLA_ENABLE_GPU
618 args->block_size = block_size;
619 args->datatype = datatype;
620#endif
621
622 return;
623}
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition FLASH_Queue.c:603
FLASH_Task * FLASH_Queue_get_tail_task(void)
Definition FLASH_Queue.c:614
dim_t FLA_Obj_datatype_size(FLA_Datatype datatype)
Definition FLA_Query.c:61
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition FLA_Query.c:13
FLA_Error FLA_Obj_free_buffer_task(FLA_Obj obj, void *cntl)
Definition FLA_Obj_free_buffer_task.c:13
int FLASH_Data_aff
Definition FLA_type_defs.h:114
int height
Definition FLA_type_defs.h:191
int n_write_blocks
Definition FLA_type_defs.h:146
dim_t m_index
Definition FLA_type_defs.h:134
dim_t n_index
Definition FLA_type_defs.h:135

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, i, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

◆ FLASH_Queue_invalidate_block_gpu()

void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void * arg 
)
1850{
1852 int j, k;
1855
1856#ifdef FLA_ENABLE_MULTITHREADING
1857 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1858#endif
1859
1860 // Locate the position of the block on the GPU.
1861 for ( k = 0; k < gpu_n_blocks; k++ )
1862 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1863 break;
1864
1865 // The block is owned by another GPU.
1866 if ( k < gpu_n_blocks )
1867 {
1868 // Invalidate the block.
1869 args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;
1870
1871 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1872 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1873
1874 // Save the block that will be invalidated.
1875 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1876
1877 // Shift all the blocks for the invalidated block.
1878 for ( j = k; j < gpu_n_blocks - 1; j++ )
1879 args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j + 1];
1880
1881 // Move to the LRU block.
1882 args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
1883 }
1884
1885#ifdef FLA_ENABLE_MULTITHREADING
1886 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1887#endif
1888
1889 return;
1890}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

◆ FLASH_Queue_mark_gpu()

void FLASH_Queue_mark_gpu ( FLASH_Task * t,
void * arg 
)
1793{
1795 int i, j, k;
1796 int thread = t->thread;
1799 FLA_Obj obj;
1800
1801 // Mark all the output blocks on the GPU as dirty.
1802 for ( i = t->n_output_args - 1; i >= 0; i-- )
1803 {
1804 obj = t->output_arg[i];
1805
1806 // Check for duplicate blocks.
1807 duplicate = FALSE;
1808
1809 for ( j = 0; j < i && !duplicate; j++ )
1810 {
1811 if ( obj.base == t->output_arg[j].base )
1812 duplicate = TRUE;
1813 }
1814
1815 // If the output block has not been processed before.
1816 if ( !duplicate )
1817 {
1818#ifdef FLA_ENABLE_MULTITHREADING
1819 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1820#endif
1821
1822 // Locate the position of the block on the GPU.
1823 for ( k = 0; k < gpu_n_blocks; k++ )
1824 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1825 break;
1826
1827 if ( k < gpu_n_blocks )
1828 {
1829 // Change the bits for the new dirty block.
1830 args->gpu[thread * gpu_n_blocks + k].clean = FALSE;
1831 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1832 }
1833
1834#ifdef FLA_ENABLE_MULTITHREADING
1835 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1836#endif
1837 }
1838 }
1839
1840 return;
1841}

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_prefetch()

void FLASH_Queue_prefetch ( int  cache,
void * arg 
)
1030{
1032 int i;
1033 int size = args->size;
1034 FLA_Obj obj;
1035
1036 // Prefetch blocks in opposite order to maintain LRU.
1037 for ( i = size - 1; i >= 0; i-- )
1038 {
1039 obj = args->prefetch[i];
1040
1041 // Only prefetch if it is a valid block.
1042 if ( obj.base != NULL )
1043 {
1044 // Prefetch the block.
1046
1047 // Record the prefetched block in the cache.
1048 args->cache[cache * size + i] = obj;
1049 }
1050 }
1051
1052 return;
1053}
void FLASH_Queue_prefetch_block(FLA_Obj obj)
Definition FLASH_Queue_exec.c:1056

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), i, FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_prefetch_block()

void FLASH_Queue_prefetch_block ( FLA_Obj  obj)
1062{
1063 int i, inc;
1065 int elem_size = FLA_Obj_elem_size( obj );
1066 int length = FLA_Obj_length( obj );
1067 int width = FLA_Obj_width( obj );
1068 FLA_Datatype datatype = FLA_Obj_datatype( obj );
1069
1070 // Determine stride to prefetch block into cache.
1072
1073 // Switch between the five supported datatypes.
1074 switch ( datatype )
1075 {
1076 case FLA_FLOAT:
1077 {
1078 float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
1079 float access;
1080
1081 // Access each cache line of the block.
1082 for ( i = 0; i < length * width; i += inc )
1083 access = buffer[i];
1084
1085 // Prevent dead code elimination.
1086 access += 1.0;
1087
1088 break;
1089 }
1090 case FLA_DOUBLE:
1091 {
1092 double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
1093 double access;
1094
1095 // Access each cache line of the block.
1096 for ( i = 0; i < length * width; i += inc )
1097 access = buffer[i];
1098
1099 // Prevent dead code elimination.
1100 access += 1.0;
1101
1102 break;
1103 }
1104 case FLA_COMPLEX:
1105 {
1106 scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
1108
1109 // Access each cache line of the block.
1110 for ( i = 0; i < length * width; i += inc )
1111 access = buffer[i];
1112
1113 // Prevent dead code elimination.
1114 access.real += 1.0;
1115
1116 break;
1117 }
1118 case FLA_DOUBLE_COMPLEX:
1119 {
1120 dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
1122
1123 // Access each cache line of the block.
1124 for ( i = 0; i < length * width; i += inc )
1125 access = buffer[i];
1126
1127 // Prevent dead code elimination.
1128 access.real += 1.0;
1129
1130 break;
1131 }
1132 case FLA_INT:
1133 {
1134 int *buffer = ( int * ) FLA_INT_PTR( obj );
1135 int access;
1136
1137 // Access each cache line of the block.
1138 for ( i = 0; i < length * width; i += inc )
1139 access = buffer[i];
1140
1141 // Prevent dead code elimination.
1142 access += 1.0;
1143
1144 break;
1145 }
1146 default:
1147 // This default case should never execute.
1149 }
1150
1151 return;
1152}
dim_t FLASH_Queue_get_cache_line_size(void)
Definition FLASH_Queue.c:524
dim_t FLA_Obj_elem_size(FLA_Obj obj)
Definition FLA_Query.c:95
Definition blis_type_defs.h:138
double real
Definition blis_type_defs.h:139
Definition blis_type_defs.h:133
float real
Definition blis_type_defs.h:134

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), i, scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

◆ FLASH_Queue_update_block_gpu()

void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void * arg 
)
1704{
1706 int j, k;
1712
1713#ifdef FLA_ENABLE_MULTITHREADING
1714 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1715#endif
1716
1717 // Locate the position of the block on GPU.
1718 for ( k = 0; k < gpu_n_blocks - 1; k++ )
1719 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1720 break;
1721
1722 // Save the pointer to the data on the GPU.
1723 buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;
1724
1725 // Save the victim block.
1726 evict_obj = args->gpu[thread * gpu_n_blocks + k];
1727
1728 // The block is not already in the GPU.
1729 if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
1730 {
1731 // Save for data transfer outside of critical section.
1732 transfer = TRUE;
1733
1734 // Save for eviction outside of critical section.
1735 if ( evict_obj.obj.base != NULL && !evict_obj.clean )
1736 {
1737 evict = TRUE;
1738 args->victim[thread] = evict_obj;
1739 }
1740
1741 // Save the block in the data structure.
1742 args->gpu[thread * gpu_n_blocks + k].obj = obj;
1743
1744 // Make sure the new block is clean.
1745 args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1746 args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1747 }
1748
1749 // Use the block on the GPU that is a hit or LRU.
1750 gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1751
1752 // Shift all the previous blocks for LRU replacement.
1753 for ( j = k; j > 0; j-- )
1754 args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
1755
1756 // Place the block on the cache as the most recently used.
1757 args->gpu[thread * gpu_n_blocks] = gpu_obj;
1758
1759#ifdef FLA_ENABLE_MULTITHREADING
1760 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1761#endif
1762
1763 // Evict and flush the LRU dirty block.
1764 if ( evict )
1765 {
1766 FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );
1767
1768#ifdef FLA_ENABLE_MULTITHREADING
1769 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1770#endif
1771
1772 args->victim[thread].obj.base = NULL;
1773
1774#ifdef FLA_ENABLE_MULTITHREADING
1775 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1776#endif
1777 }
1778
1779 // Move the block to the GPU.
1780 if ( transfer )
1781 FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1782
1783 return;
1784}
FLA_Error FLASH_Queue_write_gpu(FLA_Obj obj, void *buffer_gpu)
Definition FLASH_Queue_gpu.c:185

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

◆ FLASH_Queue_update_cache()

void FLASH_Queue_update_cache ( FLASH_Task * t,
void * arg 
)
853{
854 int i, j;
856 FLA_Obj obj;
857
858 if ( t == NULL )
859 return;
860
861 // Updating the input blocks.
862 for ( i = t->n_input_args - 1; i >= 0; i-- )
863 {
864 // Check for duplicate blocks.
866
867 for ( j = 0; j < t->n_output_args && !duplicate; j++ )
868 {
869 if ( t->input_arg[i].base == t->output_arg[j].base )
870 duplicate = TRUE;
871 }
872
873 for ( j = 0; j < i && !duplicate; j++ )
874 {
875 if ( t->input_arg[i].base == t->input_arg[j].base )
876 duplicate = TRUE;
877 }
878
879 // If the input block has not been processed before.
880 if ( !duplicate )
881 {
882 obj = t->input_arg[i];
883
884 // Macroblock is used.
885 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
886 {
887 dim_t jj, kk;
888 dim_t m = FLA_Obj_length( obj );
889 dim_t n = FLA_Obj_width( obj );
890 dim_t cs = FLA_Obj_col_stride( obj );
891 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
892
893 // Update the cache for each input block in the macroblock.
894 for ( jj = 0; jj < n; jj++ )
895 for ( kk = 0; kk < m; kk++ )
896 FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
897 t->cache, FALSE, arg );
898 }
899 else // Regular block.
900 {
901 FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
902 }
903 }
904 }
905
906 // Updating the output blocks.
907 for ( i = t->n_output_args - 1; i >= 0; i-- )
908 {
909 // Check for duplicate blocks.
911
912 for ( j = 0; j < i && !duplicate; j++ )
913 {
914 if ( t->output_arg[i].base == t->output_arg[j].base )
915 duplicate = TRUE;
916 }
917
918 // If the output block has not been processed before.
919 if ( !duplicate )
920 {
921 obj = t->output_arg[i];
922
923 // Macroblock is used.
924 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
925 {
926 dim_t jj, kk;
927 dim_t m = FLA_Obj_length( obj );
928 dim_t n = FLA_Obj_width( obj );
929 dim_t cs = FLA_Obj_col_stride( obj );
930 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
931
932 // Update the cache for each output block in the macroblock.
933 for ( jj = 0; jj < n; jj++ )
934 for ( kk = 0; kk < m; kk++ )
936 t->cache, TRUE, arg );
937 }
938 else // Regular block.
939 {
940 FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
941 }
942 }
943 }
944
945 return;
946}
void FLASH_Queue_update_cache_block(FLA_Obj obj, int cache, FLA_Bool output, void *arg)
Definition FLASH_Queue_exec.c:949

References FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), and i.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

◆ FLASH_Queue_update_cache_block()

void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void * arg 
)
958{
960 int i, j, k;
961 int n_caches = args->n_caches;
962 int size = args->size;
963
964#ifdef FLA_ENABLE_MULTITHREADING
965 FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
966#endif
967
968 // Locate the position of the block in the cache.
969 for ( k = 0; k < size - 1; k++ )
970 {
971 if ( obj.base == args->cache[cache * size + k].base )
972 break;
973 }
974
975 // Shift all the previous blocks for LRU replacement.
976 for ( j = k; j > 0; j-- )
977 args->cache[cache * size + j] = args->cache[cache * size + j - 1];
978
979 // Place the block on the cache as the most recently used.
980 args->cache[cache * size] = obj;
981
982#ifdef FLA_ENABLE_MULTITHREADING
983 FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
984#endif
985
986 // Write invalidate if updating with output block.
987 if ( output )
988 {
989 for ( i = 0; i < n_caches; i++ )
990 {
991 if ( i != cache )
992 {
993#ifdef FLA_ENABLE_MULTITHREADING
994 FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
995#endif
996 // Locate the position of the block in the cache.
997 for ( k = 0; k < size; k++ )
998 {
999 if ( obj.base == args->cache[i * size + k].base )
1000 break;
1001 }
1002
1003 // The block is held in another cache.
1004 if ( k < size )
1005 {
1006 // Shift all the blocks for the invalidated block.
1007 for ( j = k; j < size - 1; j++ )
1008 args->cache[i * size + j] = args->cache[i * size + j + 1];
1009
1010 // Invalidate the block.
1011 args->cache[i * size + size - 1].base = NULL;
1012 }
1013#ifdef FLA_ENABLE_MULTITHREADING
1014 FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
1015#endif
1016 }
1017 }
1018 }
1019
1020 return;
1021}

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), i, FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

◆ FLASH_Queue_update_gpu()

void FLASH_Queue_update_gpu ( FLASH_Task * t,
void **  input_arg,
void **  output_arg,
void * arg 
)
1599{
1600 int i, j, k;
1601 int thread = t->thread;
1604
1605 // None of the arguments can be macroblocks yet.
1606 // Complicating factor is copying macroblock to contiguous memory on GPU.
1607
1608 // Bring the input arguments to the GPU.
1609 for ( i = t->n_input_args - 1; i >= 0; i-- )
1610 {
1611 // Check for duplicate blocks.
1612 duplicate = FALSE;
1613
1614 for ( j = 0; j < t->n_output_args && !duplicate; j++ )
1615 {
1616 if ( t->input_arg[i].base == t->output_arg[j].base )
1617 duplicate = TRUE;
1618 }
1619
1620 for ( j = 0; j < i && !duplicate; j++ )
1621 {
1622 if ( t->input_arg[i].base == t->input_arg[j].base )
1623 duplicate = TRUE;
1624 }
1625
1626 // If the input block has not been processed before.
1627 if ( !duplicate )
1628 {
1629 FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
1630 }
1631 else
1632 {
1633 input_arg[i] = NULL;
1634 }
1635 }
1636
1637 // Bring the output arguments to the GPU.
1638 for ( i = t->n_output_args - 1; i >= 0; i-- )
1639 {
1640 // Check for duplicate blocks.
1641 duplicate = FALSE;
1642
1643 for ( j = 0; j < i && !duplicate; j++ )
1644 {
1645 if ( t->output_arg[i].base == t->output_arg[j].base )
1646 duplicate = TRUE;
1647 }
1648
1649 // If the output block has not been processed before.
1650 if ( !duplicate )
1651 {
1652 FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );
1653
1654 // Invalidate output blocks on all other GPUs.
1655 for ( k = 0; k < n_threads; k++ )
1656 if ( k != thread )
1657 FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
1658 }
1659 else
1660 {
1661 output_arg[i] = NULL;
1662 }
1663 }
1664
1665 // Check to see if there are any duplicates.
1666 for ( i = t->n_input_args - 1; i >= 0; i-- )
1667 {
1668 for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
1669 {
1670 if ( t->input_arg[i].base == t->output_arg[j].base )
1671 input_arg[i] = output_arg[j];
1672 }
1673
1674 for ( j = 0; j < i && input_arg[i] == NULL; j++ )
1675 {
1676 if ( t->input_arg[i].base == t->input_arg[j].base )
1677 input_arg[i] = input_arg[j];
1678 }
1679 }
1680
1681 // Check to see if there are any duplicates.
1682 for ( i = t->n_output_args - 1; i >= 0; i-- )
1683 {
1684 for ( j = 0; j < i && output_arg[i] == NULL; j++ )
1685 {
1686 if ( t->output_arg[i].base == t->output_arg[j].base )
1687 output_arg[i] = output_arg[j];
1688 }
1689 }
1690
1691 return;
1692}
void FLASH_Queue_update_block_gpu(FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
Definition FLASH_Queue_exec.c:1695

References FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), and i.

Referenced by FLASH_Queue_exec_gpu().

◆ FLASH_Queue_wait_dequeue()

FLASH_Task * FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void * arg
)
684{
// Remove one task from wait_queue[queue] and return it; returns NULL when
// that queue holds no tasks. A queue with exactly one task is simply
// cleared. Otherwise the task is either chosen by
// FLASH_Queue_wait_dequeue_block() and unlinked from wherever it sits in
// the list, or the head is popped.
// NOTE(review): listing lines 685, 687, 690, 707, 745 and 756 are absent
// from this extract. `args` and `enabled` are defined on lines not shown,
// as is the condition guarding the block opened at 708 and the bodies of
// the else branches at 743 and 754 — consult the original file.
686 FLASH_Task* t = NULL;
688
689#ifdef FLA_ENABLE_GPU
691#endif
692
693 if ( args->wait_queue[queue].n_tasks > 0 )
694 {
695 // Dequeue the first task.
696 t = args->wait_queue[queue].head;
697
698 if ( args->wait_queue[queue].n_tasks == 1 )
699 {
700 // Clear the queue of its only task.
701 args->wait_queue[queue].head = NULL;
702 args->wait_queue[queue].tail = NULL;
703 }
704 else
705 {
706 // Grab a new task if using cache affinity.
708 {
709 // Determine if using GPU or not.
710 if ( enabled )
711 {
// Without FLA_ENABLE_GPU this branch compiles to nothing, so t keeps the
// head task assigned at 696.
712#ifdef FLA_ENABLE_GPU
713#ifdef FLA_ENABLE_MULTITHREADING
714 FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***
715#endif
716 // Find a task where the task has blocks currently in GPU.
717 t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
718
719#ifdef FLA_ENABLE_MULTITHREADING
720 FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
721#endif
722#endif
723 }
724 else
725 {
726#ifdef FLA_ENABLE_MULTITHREADING
727 FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
728#endif
729 // Find a task where the task has blocks currently in cache.
730 t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
731
732#ifdef FLA_ENABLE_MULTITHREADING
733 FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
734#endif
735 }
736
// This path is only reached with n_tasks > 1, so a head task has a
// successor and a tail task has a predecessor to relink.
737 // Adjust pointers if the task is head of waiting queue.
738 if ( t->prev_wait == NULL )
739 {
740 args->wait_queue[queue].head = t->next_wait;
741 args->wait_queue[queue].head->prev_wait = NULL;
742 }
743 else
744 {
746 }
747
748 // Adjust pointers if the task is tail of waiting queue.
749 if ( t->next_wait == NULL )
750 {
751 args->wait_queue[queue].tail = t->prev_wait;
752 args->wait_queue[queue].tail->next_wait = NULL;
753 }
754 else
755 {
757 }
758 }
759 else
760 {
761 // Adjust pointers in waiting queue.
762 args->wait_queue[queue].head = t->next_wait;
763 args->wait_queue[queue].head->prev_wait = NULL;
764 }
765 }
766
767 // Clear the task's waiting linked list pointers.
768 t->prev_wait = NULL;
769 t->next_wait = NULL;
770
771 // Decrement number of tasks on waiting queue.
772 args->wait_queue[queue].n_tasks--;
773 }
774
775 return t;
776}
FLASH_Task * FLASH_Queue_wait_dequeue_block(int queue, int cache, void *arg)
Definition FLASH_Queue_exec.c:779
FLASH_Task * prev_wait
Definition FLA_type_defs.h:240
FLASH_Task * next_wait
Definition FLA_type_defs.h:241

References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, i, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().

◆ FLASH_Queue_wait_dequeue_block()

FLASH_Task * FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void * arg
)
785{
// Walk the tasks of wait_queue[queue] starting at the head and return the
// first one that has an output argument whose base matches one of the
// `size` block entries recorded for `cache`; that task's hit flag is set
// before returning. When no task matches, the queue head is returned.
// This function does not unlink the task from the queue.
// NOTE(review): listing lines 786, 790, 796 and 800 are absent from this
// extract. `args` and `enabled` are defined on lines not shown, and the
// statement controlled by the `if ( enabled )` at 799 is missing —
// presumably it overrides `size` for the GPU case
// (FLASH_Queue_get_gpu_num_blocks() appears in the References list);
// confirm against the original file.
787 int i, j, k;
788 int size = args->size;
789 int n_tasks = args->wait_queue[queue].n_tasks;
791 FLASH_Task* t;
792 FLA_Obj obj;
793 FLA_Obj mem;
794
795#ifdef FLA_ENABLE_GPU
797
798 // If using GPUs, then only check GPU and not the cache.
799 if ( enabled )
801#endif
802
803 t = args->wait_queue[queue].head;
804
805 // Check if any of the output blocks are in the cache.
806 for ( i = 0; i < n_tasks; i++ )
807 {
808 for ( j = 0; j < size; j++ )
809 {
810 // Initialize the memory just in case.
811 mem.base = NULL;
812
813 // Determine if using GPU or not.
814 if ( enabled )
815 {
// Without FLA_ENABLE_GPU, mem.base stays NULL for this entry.
816#ifdef FLA_ENABLE_GPU
817 mem = args->gpu[cache * size + j].obj;
818#endif
819 }
820 else
821 {
822 mem = args->cache[cache * size + j];
823 }
824
825 for ( k = 0; k < t->n_output_args; k++ )
826 {
827 obj = t->output_arg[k];
828
// For an FLA_MATRIX-typed argument, compare against the object that
// FLASH_OBJ_PTR_AT( obj ) points to rather than the argument itself.
829 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
830 obj = *FLASH_OBJ_PTR_AT( obj );
831
832 // Return the task if its output block is in cache.
833 if ( mem.base == obj.base )
834 {
835 t->hit = TRUE;
836 return t;
837 }
838 }
839 }
840 t = t->next_wait;
841 }
842
843 return args->wait_queue[queue].head;
844}

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, i, FLASH_Queue_s::n_tasks, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

◆ FLASH_Queue_wait_enqueue()

void FLASH_Queue_wait_enqueue ( FLASH_Task * t,
void * arg
)
632{
// Add task t to wait_queue[t->queue] and increment that queue's task
// count. On an empty queue t becomes both head and tail. Otherwise t
// starts out positioned after the current tail; the loop at 648 stops at
// the first predecessor whose height is >= t's height, after which the
// head/tail pointers are updated if t ended up at either end.
// NOTE(review): listing lines 633, 646, 653, 654, 662 and 668 are absent
// from this extract. `args` is defined on a line not shown, as are the
// condition guarding the block opened at 647 (the References list names
// FLASH_Queue_get_sorting()), the statements that step t backwards inside
// the loop, and the bodies of the two `else` branches at 661 and 667 —
// consult the original file.
634 int queue = t->queue;
635
636 if ( args->wait_queue[queue].n_tasks == 0 )
637 {
638 args->wait_queue[queue].head = t;
639 args->wait_queue[queue].tail = t;
640 }
641 else
642 {
643 t->prev_wait = args->wait_queue[queue].tail;
644
645 // Insertion sort of tasks in waiting queue.
647 {
648 while ( t->prev_wait != NULL )
649 {
650 if ( t->prev_wait->height >= t->height )
651 break;
652
655 }
656 }
657
658 // Checking if the task is the head of the waiting queue.
659 if ( t->prev_wait == NULL )
660 args->wait_queue[queue].head = t;
661 else
663
664 // Checking if the task is the tail of the waiting queue.
665 if ( t->next_wait == NULL )
666 args->wait_queue[queue].tail = t;
667 else
669 }
670
671 // Increment number of tasks on waiting queue.
672 args->wait_queue[queue].n_tasks++;
673
674 return;
675}
FLA_Bool FLASH_Queue_get_sorting(void)
Definition FLASH_Queue.c:332

References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, i, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().

◆ FLASH_Queue_work_stealing()

FLASH_Task * FLASH_Queue_work_stealing ( int  queue,
void * arg
)
1161{
// Try to take one task from another queue on behalf of `queue`.
// Returns NULL immediately when there is only one queue. Otherwise a
// random queue index q != queue is drawn, and the tail task of
// wait_queue[q] (if any) is unlinked, re-tagged with t->queue = queue and
// returned. Returns NULL when the chosen queue is empty; only one victim
// queue is tried per call.
// NOTE(review): listing line 1162 is absent from this extract; `args` is
// defined there.
1163 int q;
1164 int n_queues = args->n_queues;
1165 FLASH_Task* t = NULL;
1166
1167 // Do not perform work stealing if there is only one queue.
1168 if ( n_queues == 1 )
1169 return t;
1170
1171 // Find a random queue not equal to the current queue.
1172 do
1173 {
1174#ifdef FLA_ENABLE_WINDOWS_BUILD
// NOTE(review): q is declared int, while rand_s() is documented to take
// an unsigned int *. A value above INT_MAX would read back negative and
// make q % n_queues negative, which is then used as an array index below
// — confirm and consider an unsigned temporary.
1175 rand_s( &q );
1176 q = q % n_queues;
1177#else
1178#ifdef FLA_ENABLE_TIDSP
1179 q = rand() % n_queues;
1180#else
1181 q = lrand48() % n_queues;
1182#endif
1183#endif
1184 }
1185 while ( q == queue );
1186
// The victim queue's run lock is held for the whole inspect-and-unlink
// sequence below when multithreading is enabled.
1187#ifdef FLA_ENABLE_MULTITHREADING
1188 FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189#endif
1190
1191 // If there are tasks that this thread can steal.
1192 if ( args->wait_queue[q].n_tasks > 0 )
1193 {
1194 // Dequeue the last task.
1195 t = args->wait_queue[q].tail;
1196
1197 if ( args->wait_queue[q].n_tasks == 1 )
1198 {
1199 // Clear the queue of its only task.
1200 args->wait_queue[q].head = NULL;
1201 args->wait_queue[q].tail = NULL;
1202 }
1203 else
1204 {
1205 // Adjust pointers in waiting queue.
1206 args->wait_queue[q].tail = t->prev_wait;
1207 args->wait_queue[q].tail->next_wait = NULL;
1208 }
1209
1210 // Reset waiting queue data about the stolen task.
1211 t->queue = queue;
1212 t->prev_wait = NULL;
1213 t->next_wait = NULL;
1214
1215 args->wait_queue[q].n_tasks--;
1216 }
1217
1218#ifdef FLA_ENABLE_MULTITHREADING
1219 FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220#endif
1221
1222 return t;
1223}
int queue
Definition FLA_type_defs.h:190

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, i, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Task_free_parallel()

void FLASH_Task_free_parallel ( FLASH_Task * t,
void * arg
)
2452{
// Release task t and the dependency bookkeeping attached to its
// arguments:
//   1. set write_task to NULL on every output block (each sub-block when
//      the argument is an FLA_MATRIX-typed macroblock);
//   2. for every input block, detach its read-task list while holding
//      war_lock[n_read_blocks % n_threads], then free the detached
//      FLASH_Dep nodes after the lock is released;
//   3. free t's dep_arg list, its int_arg / fla_arg / input_arg /
//      output_arg arrays, and finally t itself.
// t must not be used by the caller after this returns.
// NOTE(review): listing lines 2453 and 2456 are absent from this extract;
// `args` and `n_threads` are defined on those lines.
2454 int i, j, k;
2455 int thread;
2457 FLASH_Dep* d;
2458 FLASH_Dep* next_dep;
2459 FLA_Obj obj;
2460
2461 // Clearing the last write task in each output block.
2462 for ( i = 0; i < t->n_output_args; i++ )
2463 {
2464 obj = t->output_arg[i];
2465
2466 // Macroblock is used.
2467 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468 {
2469 dim_t jj, kk;
2470 dim_t m = FLA_Obj_length( obj );
2471 dim_t n = FLA_Obj_width( obj );
2472 dim_t cs = FLA_Obj_col_stride( obj );
2473 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2474
2475 // Clear each block in macroblock.
2476 for ( jj = 0; jj < n; jj++ )
2477 for ( kk = 0; kk < m; kk++ )
2478 ( buf + jj * cs + kk )->base->write_task = NULL;
2479 }
2480 else // Clear regular block.
2481 {
2482 obj.base->write_task = NULL;
2483 }
2484 }
2485
2486 // Cleaning the last read tasks in each input block.
2487 for ( i = 0; i < t->n_input_args; i++ )
2488 {
2489 obj = t->input_arg[i];
2490
2491 // Macroblock is used.
2492 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493 {
2494 dim_t jj, kk;
2495 dim_t m = FLA_Obj_length( obj );
2496 dim_t n = FLA_Obj_width( obj );
2497 dim_t cs = FLA_Obj_col_stride( obj );
2498 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2499
2500 // Clear each block in macroblock.
2501 for ( jj = 0; jj < n; jj++ )
2502 {
2503 for ( kk = 0; kk < m; kk++ )
2504 {
// obj is reused here to hold the current sub-block; m, n, cs and buf were
// already captured from the macroblock above.
2505 obj = *( buf + jj * cs + kk );
2506
2507 thread = obj.base->n_read_blocks % n_threads;
2508
2509 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510
// Snapshot the list length and head, then reset the block's list fields
// while the lock is held; the nodes themselves are freed after release.
2511 k = obj.base->n_read_tasks;
2512 d = obj.base->read_task_head;
2513
2514 obj.base->n_read_tasks = 0;
2515 obj.base->read_task_head = NULL;
2516 obj.base->read_task_tail = NULL;
2517
2518 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519
2520 for ( j = 0; j < k; j++ )
2521 {
2522 next_dep = d->next_dep;
2523 FLA_free( d );
2524 d = next_dep;
2525 }
2526 }
2527 }
2528 }
2529 else // Regular block.
2530 {
2531 thread = obj.base->n_read_blocks % n_threads;
2532
2533 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534
2535 k = obj.base->n_read_tasks;
2536 d = obj.base->read_task_head;
2537
2538 obj.base->n_read_tasks = 0;
2539 obj.base->read_task_head = NULL;
2540 obj.base->read_task_tail = NULL;
2541
2542 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543
2544 for ( j = 0; j < k; j++ )
2545 {
2546 next_dep = d->next_dep;
2547 FLA_free( d );
2548 d = next_dep;
2549 }
2550 }
2551 }
2552
2553 // Free the dep_arg field of t.
2554 d = t->dep_arg_head;
2555
2556 for ( i = 0; i < t->n_dep_args; i++ )
2557 {
2558 next_dep = d->next_dep;
2559 FLA_free( d );
2560 d = next_dep;
2561 }
2562
2563 // Free the int_arg field of t.
2564 FLA_free( t->int_arg );
2565
2566 // Free the fla_arg field of t.
2567 FLA_free( t->fla_arg );
2568
2569 // Free the input_arg field of t.
2570 FLA_free( t->input_arg );
2571
2572 // Free the output_arg field of t.
2573 FLA_free( t->output_arg );
2574
2575 // Finally, free the struct itself.
2576 FLA_free( t );
2577
2578 return;
2579}
FLASH_Dep * read_task_tail
Definition FLA_type_defs.h:151
FLASH_Dep * read_task_head
Definition FLA_type_defs.h:150
int n_read_blocks
Definition FLA_type_defs.h:145
FLASH_Task * write_task
Definition FLA_type_defs.h:154
int n_read_tasks
Definition FLA_type_defs.h:149

References FLA_Obj_view::base, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), i, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

◆ FLASH_Task_update_binding()

FLASH_Task * FLASH_Task_update_binding ( FLASH_Task * t,
FLASH_Task * r,
void * arg
)
2401{
// Decide which of two tasks stays bound: r (the currently bound task, may
// be NULL) or t (a newly ready task). Returns the task that is bound
// afterwards, with its hit flag set to TRUE when t is chosen.
//   - r == NULL: t is bound.
//   - r has no hit, or sorting is on and r is lower than t: r's hit flag
//     is cleared and t is bound in its place.
//   - otherwise r is kept.
// NOTE(review): listing lines 2402, 2422 and 2436 are absent from this
// extract. `args` is defined on a line not shown, and the statements
// executed while run_lock[queue] is held are missing — presumably calls
// to FLASH_Queue_wait_enqueue() (named in the References list) for the
// task that is not bound; confirm against the original file.
2403 int queue;
2404
2405 if ( r == NULL )
2406 {
2407 // There are no tasks on waiting queue, so bind the first task.
2408 r = t;
2409 r->hit = TRUE;
2410 }
2411 else
2412 {
2413 // Swap the binded task for the new ready task.
2414 if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
2415 {
2416 queue = r->queue;
2417 r->hit = FALSE;
2418
2419 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2420
2421 // Place swapped task back onto waiting queue.
2423
2424 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2425
2426 // Bind the new ready task.
2427 r = t;
2428 r->hit = TRUE;
2429 }
2430 else // Keep the binded task and enqueue new ready task.
2431 {
2432 queue = t->queue;
2433
2434 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2435
2437
2438 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2439 }
2440 }
2441
2442 return r;
2443}
FLA_Bool hit
Definition FLA_type_defs.h:194

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, i, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

◆ FLASH_Task_update_dependencies()

FLASH_Task * FLASH_Task_update_dependencies ( FLASH_Task * t,
void * arg
)
2322{
// Walk the dependency list of task t and decrement n_ready on each
// dependent task; a dependent whose n_ready reaches zero is handed on
// either to FLASH_Task_update_binding() or to the locked section at
// 2378-2382. Returns r, the task selected for binding; r stays NULL when
// `caching` is false.
// NOTE(review): listing lines 2323, 2328-2331 and 2380 are absent from
// this extract. `args`, `n_threads`, `caching`, `stealing` and
// `available` are defined on lines not shown, and the statement executed
// while run_lock[queue] is held at 2378-2382 is missing — presumably a
// call to FLASH_Queue_wait_enqueue() (named in the References list);
// confirm against the original file.
2324 int i;
2325 int q = t->queue;
2326 int queue;
2327 int thread;
2332 FLASH_Task* task;
2333 FLASH_Task* r = NULL;
2334 FLASH_Dep* d = t->dep_arg_head;
2335
2336 // Dequeue task to bind to thread if caching is enabled.
2337 if ( caching )
2338 {
2339 FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
2340
2341 // Obtain task to execute.
2342 r = FLASH_Queue_wait_dequeue( q, t->cache, arg );
2343
2344 FLA_Lock_release( &(args->run_lock[q]) ); // R ***
2345 }
2346
2347 // Check each dependent task.
2348 for ( i = 0; i < t->n_dep_args; i++ )
2349 {
2350 if ( stealing )
2351 {
2352 // Place all dependent tasks onto same queue as predecessor task.
2353 d->task->queue = q;
2354 }
2355
2356 task = d->task;
2357 queue = task->queue;
2358 thread = task->order % n_threads;
2359
// The decrement and the zero test are done together under dep_lock so the
// result in `available` reflects this thread's own decrement.
2360 FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***
2361
2362 task->n_ready--;
2363 available = ( task->n_ready == 0 );
2364
2365 FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***
2366
2367 // Place newly ready tasks on waiting queue.
2368 if ( available )
2369 {
2370 // If caching is enabled and the task belongs to this thread's queue.
2371 if ( caching && q == queue )
2372 {
2373 // Determine if there is a new binded task.
2374 r = FLASH_Task_update_binding( task, r, arg );
2375 }
2376 else
2377 {
2378 FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2379
2381
2382 FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2383 }
2384 }
2385
2386 // Go to the next dep.
2387 d = d->next_dep;
2388 }
2389
2390 return r;
2391}
FLASH_Task * FLASH_Task_update_binding(FLASH_Task *t, FLASH_Task *r, void *arg)
Definition FLASH_Queue_exec.c:2394
int order
Definition FLA_type_defs.h:189

References FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), i, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().

◆ RCCE_acquire_lock()

int RCCE_acquire_lock ( int  )

◆ RCCE_release_lock()

int RCCE_release_lock ( int  )

◆ RCCE_ue()

int RCCE_ue ( void  )

◆ RCCE_wtime()

double RCCE_wtime ( void  )

Referenced by FLASH_Queue_exec().

◆ Synch_all()

void Synch_all ( )

Referenced by FLASH_Queue_exec().