Using SystemTap userspace static probes

One of the new features in glibc 2.19 was a set of SystemTap static probes in the malloc subsystem to allow a better view into its inner workings. SystemTap static probe points expand to only a single nop instruction when not enabled, and each takes a fixed number of arguments which are made available to your SystemTap probe handler. I wanted to use these probes to analyze the performance of a malloc workload, so I wrote a SystemTap script to log events in the malloc subsystem.
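As an aside, application code can declare probe points of its own using the macros in <sys/sdt.h> (shipped with SystemTap's development headers); the glibc probes are declared the same way. A minimal sketch, where the myapp provider and allocation probe are names made up for illustration:

#include <sys/sdt.h>
#include <stddef.h>

/* A hypothetical two-argument probe point. With no tool attached the
   probe site is a single nop; the locations of ptr and size are
   recorded in an ELF note for SystemTap to read when it instruments
   the process. */
void record_allocation(void *ptr, size_t size)
{
    STAP_PROBE2(myapp, allocation, ptr, size);
}

int main(void)
{
    char buf[16];
    record_allocation(buf, sizeof buf);
    return 0;
}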

To get this script to work on Fedora 20 I had to install the git version of SystemTap; otherwise some of the probes failed to parse
their arguments correctly. The script can be run like this:


# stap malloc.stp -c /usr/bin/ls

It’s also possible to run this script against a non-installed version of glibc if you modify the globs in the script to match the path to your libc and run it with the appropriate library path:


# stap malloc.stp -c "env 'LD_LIBRARY_PATH=.../glibc-build:.../glibc-build/nptl' /usr/bin/ls"

The script is very simple: it just prints a timestamp, the name of the probe point, and the arguments. I hope someone will find it useful.


probe process("/lib*/libc.so.*").mark("memory_heap_new") {
printf("%d:memory_heap_new heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_more") {
printf("%d:memory_heap_more heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_less") {
printf("%d:memory_heap_less heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_free") {
printf("%d:memory_heap_free heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_new") {
printf("%d:memory_arena_new arena %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_free_list") {
printf("%d:memory_arena_reuse_free_list free_list %xn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_wait") {
printf("%d:memory_arena_reuse_wait mutex %d arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse") {
printf("%d:memory_arena_reuse arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_retry") {
printf("%d:memory_arena_retry arena %x bytes %dn",
gettimeofday_ms(), $arg2, $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_more") {
printf("%d:memory_sbrk_more brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_less") {
printf("%d:memory_sbrk_less brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_malloc_retry") {
printf("%d:memory_malloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_free_dyn_thresholds") {
printf("%d:memory_mallopt_free_dyn_thresholds mmap %d trim %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_realloc_retry") {
printf("%d:memory_realloc_retry bytes %d oldmem %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_memalign_retry") {
printf("%d:memory_memalign_retry bytes %d alignment %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_calloc_retry") {
printf("%d:memory_calloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt") {
printf("%d:memory_mallopt param %d value %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mxfast") {
printf("%d:memory_mallopt_mxfast new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_trim_threshold") {
printf("%d:memory_mallopt_trim_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_top_pad") {
printf("%d:memory_mallopt_top_pad new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_threshold") {
printf("%d:memory_mallopt_mmap_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_max") {
printf("%d:memory_mallopt_mmap_max new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_check_action") {
printf("%d:memory_mallopt_check_action new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_perturb") {
printf("%d:memory_mallopt_perturb new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_test") {
printf("%d:memory_mallopt_arena_test new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_max") {
printf("%d:memory_mallopt_arena_max new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

calloc versus malloc and memset

The standard C library provides two ways to allocate memory from the heap, calloc and malloc. They differ superficially in their arguments but more fundamentally in the guarantees they provide about the memory they return – calloc promises to fill any memory returned with zeros but malloc does not.

It would appear that the following two code snippets are equivalent:

/* Allocating with calloc. */
void *ptr = calloc(1024, 1024);
/* Allocating with malloc. */
void *ptr = malloc(1024*1024);
memset(ptr, 0, 1024*1024);

Functionally the two snippets of code are the same – they allocate 1MB of memory full of zeros from the heap – but in one subtle way they may not be.

The standard allocator on Linux is ptmalloc, part of glibc, which for large allocations of 128kB or above may use the mmap system call to allocate memory. The mmap system call returns a mapping of zeroed pages, and ptmalloc skips the memset for calloc allocations satisfied with mmap, since the pages are already zeroed. So what does this mean?
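The 128kB threshold is tunable with mallopt, which makes this behaviour easy to experiment with. A minimal sketch, assuming glibc; the threshold value chosen here is arbitrary, and whether a given allocation really takes the mmap path also depends on the allocator's internal state:

#include <malloc.h>
#include <stdlib.h>

int main(void)
{
    /* Lower the mmap threshold from its 128kB default so the mmap
       path is easier to hit. */
    mallopt(M_MMAP_THRESHOLD, 64 * 1024);

    /* Above the threshold: likely served by mmap, so calloc can rely
       on the kernel's zeroed pages and skip the memset. */
    void *big = calloc(1024, 1024);

    /* Below the threshold: carved out of the heap, so calloc must
       zero the memory itself. */
    void *small = calloc(64, 1);

    free(small);
    free(big);
    return 0;
}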

In order to return an allocation of zeroed pages the kernel plays a trick. Instead of allocating however many pages are required and writing zeros into them, it allocates a single page filled with zeros and, every time it needs a page full of zeros, maps that same page.

[Figure: zero_page_mapping. Left: every virtual page of the allocation mapped to the single shared zero page. Right: after writes, each written page backed by its own physical page.]

Only one page is allocated initially and subsequent pages are allocated as needed when they are written to (copy on write), saving memory and allowing the allocation to be completed quickly. The allocation starts out looking like the picture on the left, and then as pages are written to would eventually end up looking like the picture on the right.
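This is observable from userspace. The sketch below reads the resident set size from /proc/self/statm (a Linux-specific interface; the second field is resident pages) before and after writing to a large calloc'd buffer. The buffer is well above the mmap threshold, so the assumption is that it is served by mmap and its pages gain physical backing only as they are written:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Read the resident set size, in pages, from /proc/self/statm. */
static long resident_pages(void)
{
    long size, resident = -1;
    FILE *f = fopen("/proc/self/statm", "r");
    if (f) {
        if (fscanf(f, "%ld %ld", &size, &resident) != 2)
            resident = -1;
        fclose(f);
    }
    return resident;
}

int main(void)
{
    size_t len = 64 * 1024 * 1024;
    char *buf = calloc(len, 1);
    if (!buf)
        return 1;

    printf("after calloc: %ld resident pages\n", resident_pages());

    /* Writing to each page triggers copy-on-write faults that back
       the buffer with real, distinct physical pages. */
    memset(buf, 1, len);

    printf("after writes: %ld resident pages\n", resident_pages());

    free(buf);
    return 0;
}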

Normally this behaviour is beneficial to the performance and memory usage of your system; however, it caught me out when running some code to benchmark memory copy routines. Allocating a source buffer for a copy benchmark with calloc is not the same as using initialized memory: because the zero page is a single physical page, a physically tagged cache can quite comfortably contain the whole of your source buffer even if the buffer you allocated was much larger than the cache.
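For a copy benchmark the fix is simple: write to every page of the source buffer before the timed run, so each page has its own physical backing. A minimal sketch:

#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    size_t len = 1024 * 1024;
    long page = sysconf(_SC_PAGESIZE);
    char *src = malloc(len);
    if (!src)
        return 1;

    /* One non-zero byte per page is enough to give every page of the
       source buffer its own physical page before timing starts. */
    for (size_t i = 0; i < len; i += (size_t)page)
        src[i] = 1;

    /* ... run the timed copy against src here ... */

    free(src);
    return 0;
}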

So in special situations like benchmarking it is best to think carefully before using calloc, but in general it can be a useful tool to improve the memory usage and performance of your code.