Using SystemTap userspace static probes

One of the new features in glibc 2.19 was a set of SystemTap static probes in the malloc subsystem to allow a better view into its inner workings. SystemTap static probe points expand to only a single nop instruction when not enabled and take a fixed number of arguments which are passed to your SystemTap probe as arguments. I wanted to use these probes to analyze the performance of a malloc workload, so I wrote a SystemTap script to log events in the malloc subsystem.

To get this script to work on Fedora 20 I had to install the git version of SystemTap or some of the probes failed to parse
their arguments correctly. The script can be run like this:


# stap malloc.stp -c /usr/bin/ls

It’s also possible to run this script using non-installed version of glibc if you modify the globs in the script to match the path to your libc and run it with the appropriate library path:


# stap malloc.stp -c "env 'LD_LIBRARY_PATH=.../glibc-build:.../glibc-build/nptl' /usr/bin/ls"

The script is very simple and just prints a timestamp, the name of the probe point and the arguments but I hope someone will find it useful.


probe process("/lib*/libc.so.*").mark("memory_heap_new") {
printf("%d:memory_heap_new heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_more") {
printf("%d:memory_heap_more heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_less") {
printf("%d:memory_heap_less heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_free") {
printf("%d:memory_heap_free heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_new") {
printf("%d:memory_arena_new arena %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_free_list") {
printf("%d:memory_arena_reuse_free_list free_list %xn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_wait") {
printf("%d:memory_arena_reuse_wait mutex %d arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse") {
printf("%d:memory_arena_reuse arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_retry") {
printf("%d:memory_arena_retry arena %x bytes %dn",
gettimeofday_ms(), $arg2, $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_more") {
printf("%d:memory_sbrk_more brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_less") {
printf("%d:memory_sbrk_less brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_malloc_retry") {
printf("%d:memory_malloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_free_dyn_thresholds") {
printf("%d:memory_mallopt_free_dyn_thresholds mmap %d trim %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_realloc_retry") {
printf("%d:memory_realloc_retry bytes %d oldmem %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_memalign_retry") {
printf("%d:memory_memalign_retry bytes %d alignment %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_calloc_retry") {
printf("%d:memory_calloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt") {
printf("%d:memory_mallopt param %d value %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mxfast") {
printf("%d:memory_mallopt_mxfast new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_trim_threshold") {
printf("%d:memory_mallopt_trim_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_top_pad") {
printf("%d:memory_mallopt_top_pad new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_threshold") {
printf("%d:memory_mallopt_mmap_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_max") {
printf("%d:memory_mallopt_mmap_max new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_check_action") {
printf("%d:memory_mallopt_check_action new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_perturb") {
printf("%d:memory_mallopt_perturb new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_test") {
printf("%d:memory_mallopt_arena_test new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_max") {
printf("%d:memory_mallopt_arena_max new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

setcontext and signal handlers

setcontext is a C library function that along with getcontext allows you to perform a non-local jump from one context to another. They are often used when implementing coroutines or custom threading libraries. longjmp and setjmp provide similar functionality but setcontext was an attempt to fix the shortcomings of these functions and standardize behaviour, although in POSIX 2008 the specification of setcontext and related functions were removed due to the difficulty of implementing them in a truly portable manner.

In the beginning there was setjmp and longjmp. setjmp would capture the current register values into a data structure and longjmp would restore those values at a later point in program execution, causing the control flow to jump back to the point where setjmp was called. This works fine unless the place where you are jumping from is a signal handler. In this case the problem you have is that setjmp will not restore the signal mask so the signal you were handling will not be unmasked. To fix this functions that saved and restored the signal mask called sigsetjmp and siglongjmp were introduced.

However, this doesn’t mean it is necessarily safe to jump out of a signal handler even if you are using siglongjmp. The problem you will often hit is that if the signal was delivered at an arbitrary point in program execution there may be locks or other global resource that need to be deallocated. The only way to avoid this is to block the appropriate signal across any part of the code that may not behave well when interrupted in this way. Unfortunately without auditing all third party libraries that probably means you can only enable handling of such a signal across very small regions of your program.

setcontext and getcontext also restore and save the signal mask and setcontext can also be used to exit from a signal handler with the caveats expressed above. However there is another way in which signal handling and setcontext interact. The context structure used by setcontext and getcontext is of type ucontext_t. If you installed your signal handler using sigaction and the sa_handler field of struct sigaction you get a third argument to the signal handler which is of type void * but allows casting to ucontext_t *. So what happens if you pass this structure to setcontext?

Well the answer is, on Linux at least, “probably nothing good”. The original definition of setcontext specified that “program execution continues with the program instruction following the instruction interrupted by the signal”, but more recent versions of the standards removed that requirement so the result is now unspecified. Some glibc ports such as powerpc, mips and tile do support restoring signal handler created contexts in the spirit of the original specification, but the rest, including x86 and ARM do not. As such it is not possible to rely on being able to restore a signal handler created context with setcontext on Linux. It would be interesting to know if any proprietary Unixes support restoring these contexts and if any applications actually use the functionality.

Using GNU indirect functions

GNU indirect functions are an extension of ELF that allows you to make a decision about which implementation of a function to use at link time. This allows you to choose the fastest code for a particular processor, which is very useful if you’re shipping your code as binaries. It involves an ELF symbol type (STT_GNU_IFUNC) and a new dynamic relocation (R_*_IRELATIVE) which have been added to the GNU ld and gold linkers and the glibc dynamic linker. Support exists for arm, powerpc, s390, sparc, i386 and x86_64 with aarch64 support coming soon.

Indirect functions aren’t used very widely at the moment, as far as I can tell only glibc uses them at the moment for things like string functions, but they can be used by any other application or library. I’ll try and explain with a short example how they can be used.

#include 
#include  /* Needed for HWCAP definitions. */

static void func1_neon(void)
{
	printf("NEON implementationn");
}

static void func1_vfp(void)
{
	printf("VFP implementationn");
}

static void func1_arm(void)
{
	printf("ARM implementationn");
}

/* This line makes func1 a GNU indirect function. */
asm (".type func1, %gnu_indirect_function");

void *func1(unsigned long int hwcap)
{
	if (hwcap & HWCAP_ARM_NEON)
		return &func1_neon;
	else if (hwcap & HWCAP_ARM_VFP)
		return &func1_vfp;
	else
		return &func1_arm;
}

This code defines a resolver function, func1, which returns a pointer to a function implementation based on the value of hwcap that is passed to it. The value of hwcap is architecture specific and on some architectures, like x86_64, it is not passed at all, and you have to determine hardware capabilities by hand.

int main(void)
{
	func1();
	return 0;
}

This main function must be defined in a separate file to prevent the compiler from getting the wrong type for func1. Somewhat confusingly the function called func1 from the caller’s perspective is of the type of its implementations rather than of the type of the indirect function. All indirect functions have the same prototype – they take an optional hwcap argument and return a pointer to a function.

Build these two pieces of code and link them together:


# gcc -O2 ifunc.c -c
# gcc -O2 main.c -c
# gcc main.o ifunc.o -o ifunc
# ./ifunc
NEON implementation
#

This is the result I get on my ARM Chromebook, your result will vary depending on which hardware you actually have. So why is this better than the alternatives? You could dispatch to the optimized routine dynamically yourself, but that adds an extra comparison and indirect call to every call to the function and you will also have to find the hardware capabilities yourself, which can be tricky. Alternatively you could build optimized libraries for every bit of hardware you support, but this takes up a lot of disk space and takes more significant infrastructure to dynamically open the correct library.

Indirect functions are not without their downsides however. They are currently supported only on Linux with the GNU toolchain and only on a subset of Linux supported architectures, and in some cases the tools are not very well tested yet – for example, at the moment this will only work correctly on ARM with the as yet unreleased glibc 2.18 due to a bug in the dynamic linker – but they are a powerful tool that deserves to be more widely used.