1010#define DISABLE_FREQUENT_EVENTS
1111#endif
1212
13+ #ifdef USE_NVTX
14+ #include <nvtx3/nvToolsExtPayload.h>
15+ #endif
16+
1317#ifdef __cplusplus
1418extern "C" {
1519#endif
@@ -57,7 +61,14 @@ static jl_mutex_t jl_timing_ittapi_events_lock;
5761#endif //USE_ITTAPI
5862
// File-scope NVTX state; only compiled when USE_NVTX is defined.
5963#ifdef USE_NVTX
// The removed declaration below was `static`; the replacement drops `static`,
// giving the domain handle external linkage.
// NOTE(review): presumably so other translation units can reference it — confirm.
60- static nvtxDomainHandle_t jl_timing_nvtx_domain ;
64+
// Domain used for timing-block ranges, category names, registered strings
// and payload schemas in this file.
65+ nvtxDomainHandle_t jl_timing_nvtx_domain ;
// Separate domain used for the per-task ranges pushed/popped on task
// enter/exit (created as "julia tasks" during timing initialization).
66+ nvtxDomainHandle_t jl_timing_nvtx_task_domain ;
67+
// Schema ids returned by nvtxPayloadSchemaRegister() during initialization;
// they tag the payload entries attached when a timing block ends.
68+ static uint64_t jl_timing_nvtx_signature_schemaid ;
69+ static uint64_t jl_timing_nvtx_module_schemaid ;
70+ static uint64_t jl_timing_nvtx_location_schemaid ;
71+
6172#endif
6273
6374#ifdef USE_TIMING_COUNTS
@@ -155,6 +166,82 @@ void jl_init_timing(void)
155166 for (int i = 0 ; i < JL_TIMING_SUBSYSTEM_LAST ; i ++ ) {
156167 nvtxDomainNameCategoryA (jl_timing_nvtx_domain , i + 1 , jl_timing_subsystems [i ]);
157168 }
169+
170+ nvtxPayloadSchemaEntry_t jl_timing_nvtx_signature_schema_entries [] = {
171+ {
172+ .flags = NVTX_PAYLOAD_ENTRY_FLAG_UNUSED ,
173+ .type = NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE ,
174+ .name = "signature" ,
175+ .offset = 0 ,
176+ }
177+ };
178+
179+ nvtxPayloadSchemaAttr_t jl_timing_nvtx_signature_schema = {
180+ .fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_NUM_ENTRIES ,
181+ .type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC ,
182+ .flags = NVTX_PAYLOAD_SCHEMA_FLAG_NONE ,
183+ .entries = jl_timing_nvtx_signature_schema_entries ,
184+ .numEntries = 1 ,
185+ .payloadStaticSize = sizeof (nvtxStringHandle_t ),
186+ };
187+ jl_timing_nvtx_signature_schemaid = nvtxPayloadSchemaRegister (jl_timing_nvtx_domain , & jl_timing_nvtx_signature_schema );
188+
189+
190+ nvtxPayloadSchemaEntry_t jl_timing_nvtx_module_schema_entries [] = {
191+ {
192+ .flags = NVTX_PAYLOAD_ENTRY_FLAG_UNUSED ,
193+ .type = NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE ,
194+ .name = "name" ,
195+ .offset = offsetof(jl_timing_nvtx_module_t , name ),
196+ },
197+ {
198+ .flags = NVTX_PAYLOAD_ENTRY_FLAG_UNUSED ,
199+ .type = NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE ,
200+ .name = "root" ,
201+ .offset = offsetof(jl_timing_nvtx_module_t , root ),
202+ },
203+ };
204+
205+ nvtxPayloadSchemaAttr_t jl_timing_nvtx_module_schema = {
206+ .fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_NUM_ENTRIES ,
207+ .name = "module" ,
208+ .type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC ,
209+ .flags = NVTX_PAYLOAD_SCHEMA_FLAG_NONE ,
210+ .entries = jl_timing_nvtx_module_schema_entries ,
211+ .numEntries = sizeof (jl_timing_nvtx_module_schema_entries ) / sizeof (jl_timing_nvtx_module_schema_entries [0 ]),
212+ .payloadStaticSize = sizeof (jl_timing_nvtx_module_t ),
213+ };
214+ jl_timing_nvtx_module_schemaid = nvtxPayloadSchemaRegister (jl_timing_nvtx_domain , & jl_timing_nvtx_module_schema );
215+
216+ nvtxPayloadSchemaEntry_t jl_timing_nvtx_location_schema_entries [] = {
217+ {
218+ .flags = NVTX_PAYLOAD_ENTRY_FLAG_UNUSED ,
219+ .type = NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE ,
220+ .name = "file" ,
221+ .offset = offsetof(jl_timing_nvtx_location_t , file ),
222+ },
223+ {
224+ .flags = NVTX_PAYLOAD_ENTRY_FLAG_UNUSED ,
225+ .type = NVTX_PAYLOAD_ENTRY_TYPE_INT ,
226+ .name = "line" ,
227+ .offset = offsetof(jl_timing_nvtx_location_t , line ),
228+ }
229+ };
230+
231+ nvtxPayloadSchemaAttr_t jl_timing_nvtx_location_schema = {
232+ .fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_FIELD_NUM_ENTRIES ,
233+ .name = "location" ,
234+ .type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC ,
235+ .flags = NVTX_PAYLOAD_SCHEMA_FLAG_NONE ,
236+ .entries = jl_timing_nvtx_location_schema_entries ,
237+ .numEntries = sizeof (jl_timing_nvtx_location_schema_entries ) / sizeof (jl_timing_nvtx_location_schema_entries [0 ]),
238+ .payloadStaticSize = sizeof (jl_timing_nvtx_location_t ),
239+ };
240+ jl_timing_nvtx_location_schemaid = nvtxPayloadSchemaRegister (jl_timing_nvtx_domain , & jl_timing_nvtx_location_schema );
241+
242+
243+
244+ jl_timing_nvtx_task_domain = nvtxDomainCreateA ("julia tasks" );
158245#endif
159246
160247 int i __attribute__((unused )) = 0 ;
@@ -401,7 +488,18 @@ JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) {
401488 if (block -> is_running ) {
402489 uint64_t t = cycleclock (); (void )t ;
403490 _ITTAPI_STOP (block );
404- _NVTX_STOP (block );
491+ #ifdef USE_NVTX
492+ size_t nvtx_payload_count = 0 ;
493+ nvtxPayloadData_t nvtx_payloads [JL_NVTX_MAX_PAYLOADS ];
494+
495+ if (block -> nvtx_payload .flags & JL_NVTX_PAYLOAD_FLAG_SIGNATURE )
496+ nvtx_payloads [nvtx_payload_count ++ ] = (nvtxPayloadData_t ){jl_timing_nvtx_signature_schemaid , sizeof (nvtxStringHandle_t ), & block -> nvtx_payload .signature };
497+ if (block -> nvtx_payload .flags & JL_NVTX_PAYLOAD_FLAG_LOCATION )
498+ nvtx_payloads [nvtx_payload_count ++ ] = (nvtxPayloadData_t ){jl_timing_nvtx_location_schemaid , sizeof (jl_timing_nvtx_location_t ), & block -> nvtx_payload .location };
499+ if (block -> nvtx_payload .flags & JL_NVTX_PAYLOAD_FLAG_MODULE )
500+ nvtx_payloads [nvtx_payload_count ++ ] = (nvtxPayloadData_t ){jl_timing_nvtx_module_schemaid , sizeof (jl_timing_nvtx_module_t ), & block -> nvtx_payload .module };
501+ nvtxRangePopPayload (jl_timing_nvtx_domain , nvtx_payloads , nvtx_payload_count );
502+ #endif
405503 _TRACY_STOP (block -> tracy_ctx );
406504 _COUNTS_STOP (block , t );
407505
@@ -432,6 +530,9 @@ void jl_timing_block_task_enter(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
432530 }
433531 }
434532
533+ #ifdef USE_NVTX
534+ nvtxDomainRangePushEx (jl_timing_nvtx_task_domain , & ct -> nvtx_attrs );
535+ #endif
435536#ifdef USE_TRACY
436537 TracyCFiberEnter (ct -> name );
437538#else
@@ -441,6 +542,9 @@ void jl_timing_block_task_enter(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
441542
442543jl_timing_block_t * jl_timing_block_task_exit (jl_task_t * ct , jl_ptls_t ptls )
443544{
545+ #ifdef USE_NVTX
546+ nvtxDomainRangePop (jl_timing_nvtx_task_domain );
547+ #endif
444548#ifdef USE_TRACY
445549 // Tracy is fairly strict about not leaving a fiber that hasn't
446550 // been entered, which happens often when connecting to a running
@@ -483,15 +587,22 @@ JL_DLLEXPORT void jl_timing_show(jl_value_t *v, jl_timing_block_t *cur_block)
483587
// Annotate the timing block `cur_block` with the identity of module `m`.
//
// - USE_TRACY: emits the module name as zone text; when `m` is neither its own
//   root nor rooted in jl_main_module, the text is "<root>.<name>" instead.
// - USE_NVTX: registers the module name and root-module name as NVTX strings
//   and stores the handles in cur_block->nvtx_payload.module, then marks the
//   module payload as present via JL_NVTX_PAYLOAD_FLAG_MODULE.
//
// With neither backend defined, the body is empty and the call is a no-op.
484588JL_DLLEXPORT void jl_timing_show_module (jl_module_t * m , jl_timing_block_t * cur_block )
485589{
486- #ifdef USE_TRACY
// `root` is needed by both the Tracy and the NVTX sections below.
590+ #if defined( USE_TRACY ) || defined( USE_NVTX )
487591 jl_module_t * root = jl_module_root (m );
592+ #endif
593+ #ifdef USE_TRACY
// Top-level modules (and modules rooted in Main) are shown by bare name.
488594 if (root == m || root == jl_main_module ) {
489595 const char * module_name = jl_symbol_name (m -> name );
490596 TracyCZoneText (cur_block -> tracy_ctx , module_name , strlen (module_name ));
491597 } else {
// Otherwise qualify the name with its root module.
492598 jl_timing_printf (cur_block , "%s.%s" , jl_symbol_name (root -> name ), jl_symbol_name (m -> name ));
493599 }
494600#endif
601+ #ifdef USE_NVTX
// Unlike the Tracy path, both name and root are always recorded separately.
602+ cur_block -> nvtx_payload .module .name = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , jl_symbol_name (m -> name ));
603+ cur_block -> nvtx_payload .module .root = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , jl_symbol_name (root -> name ));
// The flag tells the block-end code to attach the module payload.
604+ cur_block -> nvtx_payload .flags |= JL_NVTX_PAYLOAD_FLAG_MODULE ;
605+ #endif
495606}
496607
497608JL_DLLEXPORT void jl_timing_show_filename (const char * path , jl_timing_block_t * cur_block )
@@ -500,12 +611,19 @@ JL_DLLEXPORT void jl_timing_show_filename(const char *path, jl_timing_block_t *c
500611 const char * filename = gnu_basename (path );
501612 TracyCZoneText (cur_block -> tracy_ctx , filename , strlen (filename ));
502613#endif
614+ #ifdef USE_NVTX
615+ cur_block -> nvtx_payload .location .file = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , path );
616+ cur_block -> nvtx_payload .location .line = 0 ;
617+ cur_block -> nvtx_payload .flags |= JL_NVTX_PAYLOAD_FLAG_LOCATION ;
618+ #endif
503619}
504620
505621JL_DLLEXPORT void jl_timing_show_location (const char * file , int line , jl_module_t * mod , jl_timing_block_t * cur_block )
506622{
507- #ifdef USE_TRACY
623+ #if defined( USE_TRACY ) || defined( USE_NVTX )
508624 jl_module_t * root = jl_module_root (mod );
625+ #endif
626+ #ifdef USE_TRACY
509627 if (root == mod || root == jl_main_module ) {
510628 jl_timing_printf (cur_block , "%s:%d in %s" ,
511629 gnu_basename (file ),
@@ -520,6 +638,14 @@ JL_DLLEXPORT void jl_timing_show_location(const char *file, int line, jl_module_
520638 jl_symbol_name (mod -> name ));
521639 }
522640#endif
641+ #ifdef USE_NVTX
642+ cur_block -> nvtx_payload .location .file = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , file );
643+ cur_block -> nvtx_payload .location .line = line ;
644+ cur_block -> nvtx_payload .module .name = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , jl_symbol_name (mod -> name ));
645+ cur_block -> nvtx_payload .module .root = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , jl_symbol_name (root -> name ));
646+ cur_block -> nvtx_payload .flags |= JL_NVTX_PAYLOAD_FLAG_LOCATION | JL_NVTX_PAYLOAD_FLAG_MODULE ;
647+ #endif
648+ #endif
523649}
524650
525651JL_DLLEXPORT void jl_timing_show_method_instance (jl_method_instance_t * mi , jl_timing_block_t * cur_block )
@@ -537,28 +663,45 @@ JL_DLLEXPORT void jl_timing_show_method_instance(jl_method_instance_t *mi, jl_ti
// Annotate the timing block `cur_block` with a method: first shows the method
// object itself via jl_timing_show(), then its source location (file symbol
// name, line, and defining module) via jl_timing_show_location().
537663JL_DLLEXPORT void jl_timing_show_method (jl_method_t * method , jl_timing_block_t * cur_block )
538664{
539665 jl_timing_show ((jl_value_t * )method , cur_block );
// NOTE(review): this USE_NVTX section is empty — it adds no NVTX-specific
// work here; looks like a placeholder, confirm whether it is intentional.
666+ #ifdef USE_NVTX
667+
668+ #endif
540669 jl_timing_show_location (jl_symbol_name (method -> file ), method -> line , method -> module , cur_block );
541670}
542671
// Annotate the timing block `cur_block` with the function signature of `v`.
//
// The signature is rendered with jl_static_show_func_sig_() into a
// non-growable in-memory ios buffer, so over-long signatures are truncated.
// - USE_TRACY: the text is passed with an explicit length; if the buffer was
//   filled, its last three bytes are replaced by "...".
// - USE_NVTX: the text is NUL-terminated in place, registered as an NVTX
//   string, stored in cur_block->nvtx_payload.signature, and flagged with
//   JL_NVTX_PAYLOAD_FLAG_SIGNATURE.
//
// With neither backend defined, the body is empty and the call is a no-op.
543673JL_DLLEXPORT void jl_timing_show_func_sig (jl_value_t * v , jl_timing_block_t * cur_block )
544673{
545- #ifdef USE_TRACY
// The rendering step is shared by the Tracy and NVTX sections below.
674+ #if defined( USE_TRACY ) || defined( USE_NVTX )
546675 ios_t buf ;
547676 ios_mem (& buf , IOS_INLSIZE );
548677 buf .growable = 0 ; // Restrict to inline buffer to avoid allocation
549678
550679 jl_static_show_config_t config = { /* quiet */ 1 };
551680 jl_static_show_func_sig_ ((JL_STREAM * )& buf , v , config );
681+ #endif
682+ #ifdef USE_TRACY
// A full buffer means the output was (possibly) truncated: mark it with "...".
552683 if (buf .size == buf .maxsize )
553684 memset (& buf .buf [IOS_INLSIZE - 3 ], '.' , 3 );
554-
555685 TracyCZoneText (cur_block -> tracy_ctx , buf .buf , buf .size );
556686#endif
687+ #ifdef USE_NVTX
// The registration call below receives only a pointer (no length), so the
// buffer must be NUL-terminated first.
688+ if (buf .size == buf .maxsize ) {
// Full buffer: the ellipsis sits one byte earlier than in the Tracy path so
// that the final byte can hold the terminator.
689+ memset (& buf .buf [IOS_INLSIZE - 4 ], '.' , 3 );
690+ memset (& buf .buf [buf .size - 1 ], 0 , 1 );
691+ } else {
// Room left: terminate right after the rendered text.
// NOTE(review): assumes buf.maxsize == IOS_INLSIZE so index buf.size stays
// inside the inline buffer — TODO confirm.
692+ memset (& buf .buf [buf .size ], 0 , 1 );
693+ }
694+ cur_block -> nvtx_payload .signature = nvtxDomainRegisterStringA (jl_timing_nvtx_domain , buf .buf );
// The flag tells the block-end code to attach the signature payload.
695+ cur_block -> nvtx_payload .flags |= JL_NVTX_PAYLOAD_FLAG_SIGNATURE ;
696+ #endif
557697}
558698
559699JL_DLLEXPORT void jl_timing_show_macro (jl_method_instance_t * macro , jl_value_t * lno , jl_module_t * mod , jl_timing_block_t * cur_block )
560700{
561701 jl_timing_printf (cur_block , "%s" , jl_symbol_name (macro -> def .method -> name ));
702+ #ifdef USE_NVTX
703+
704+ #endif
562705 assert (jl_typetagis (lno , jl_linenumbernode_type ));
563706 jl_timing_show_location (jl_symbol_name ((jl_sym_t * )jl_fieldref (lno , 1 )),
564707 jl_unbox_int64 (jl_fieldref (lno , 0 )),
@@ -593,7 +736,7 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str)
593736
594737void jl_timing_task_init (jl_task_t * t )
595738{
596- #ifdef USE_TRACY
739+ #if defined( USE_TRACY ) || defined( USE_NVTX )
597740 jl_value_t * start_type = jl_typeof (t -> start );
598741 const char * start_name = "" ;
599742 if (jl_is_datatype (start_type ))
@@ -622,9 +765,26 @@ void jl_timing_task_init(jl_task_t *t)
622765 snprintf (fiber_name , fiber_name_len , "Task %d (\"%s\")" ,
623766 task_id ++ , start_name );
624767 }
625-
768+ #ifdef USE_TRACY
626769 t -> name = fiber_name ;
627770#endif
771+ #ifdef USE_NVTX
772+ nvtxEventAttributes_t nvtx_attrs = {0 };
773+ nvtx_attrs .version = NVTX_VERSION ;
774+ nvtx_attrs .size = NVTX_EVENT_ATTRIB_STRUCT_SIZE ;
775+
776+ nvtx_attrs .messageType = NVTX_MESSAGE_TYPE_REGISTERED ;
777+ nvtx_attrs .message .registered = nvtxDomainRegisterStringA (jl_timing_nvtx_task_domain , fiber_name );
778+
779+ // 0 is the default (unnamed) category
780+ nvtx_attrs .payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 ;
781+ nvtx_attrs .payload .ullValue = (uint64_t )t ; // cast pointer to uint64_t for identification
782+ // simple Knuth hash to get nice colors
783+ nvtx_attrs .colorType = NVTX_COLOR_ARGB ;
784+ nvtx_attrs .color = (((nvtx_attrs .payload .ullValue >> 32 ) + (nvtx_attrs .payload .ullValue & 0xffffffff )) * 2654435769 ) >> 8 ;
785+
786+ t -> nvtx_attrs = nvtx_attrs ;
787+ #endif
628788}
629789
630790JL_DLLEXPORT int jl_timing_set_enable (const char * subsystem , uint8_t enabled )
0 commit comments