Using SystemTap userspace static probes

One of the new features in glibc 2.19 was a set of SystemTap static probes in the malloc subsystem to allow a better view into its inner workings. SystemTap static probe points expand to only a single nop instruction when not enabled and take a fixed number of arguments which are passed to your SystemTap probe as arguments. I wanted to use these probes to analyze the performance of a malloc workload, so I wrote a SystemTap script to log events in the malloc subsystem.

To get this script to work on Fedora 20 I had to install the git version of SystemTap or some of the probes failed to parse
their arguments correctly. The script can be run like this:


# stap malloc.stp -c /usr/bin/ls

It’s also possible to run this script using non-installed version of glibc if you modify the globs in the script to match the path to your libc and run it with the appropriate library path:


# stap malloc.stp -c "env 'LD_LIBRARY_PATH=.../glibc-build:.../glibc-build/nptl' /usr/bin/ls"

The script is very simple and just prints a timestamp, the name of the probe point and the arguments but I hope someone will find it useful.


probe process("/lib*/libc.so.*").mark("memory_heap_new") {
printf("%d:memory_heap_new heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_more") {
printf("%d:memory_heap_more heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_less") {
printf("%d:memory_heap_less heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_heap_free") {
printf("%d:memory_heap_free heap %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_new") {
printf("%d:memory_arena_new arena %x size %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_free_list") {
printf("%d:memory_arena_reuse_free_list free_list %xn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse_wait") {
printf("%d:memory_arena_reuse_wait mutex %d arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_arena_reuse") {
printf("%d:memory_arena_reuse arena %x avoid_arena %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_arena_retry") {
printf("%d:memory_arena_retry arena %x bytes %dn",
gettimeofday_ms(), $arg2, $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_more") {
printf("%d:memory_sbrk_more brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_sbrk_less") {
printf("%d:memory_sbrk_less brk %x change %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_malloc_retry") {
printf("%d:memory_malloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_free_dyn_thresholds") {
printf("%d:memory_mallopt_free_dyn_thresholds mmap %d trim %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_realloc_retry") {
printf("%d:memory_realloc_retry bytes %d oldmem %xn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_memalign_retry") {
printf("%d:memory_memalign_retry bytes %d alignment %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_calloc_retry") {
printf("%d:memory_calloc_retry bytes %dn",
gettimeofday_ms(), $arg1)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt") {
printf("%d:memory_mallopt param %d value %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mxfast") {
printf("%d:memory_mallopt_mxfast new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_trim_threshold") {
printf("%d:memory_mallopt_trim_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_top_pad") {
printf("%d:memory_mallopt_top_pad new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_threshold") {
printf("%d:memory_mallopt_mmap_threshold new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_mmap_max") {
printf("%d:memory_mallopt_mmap_max new %d old %d dyn_threshold %dn",
gettimeofday_ms(), $arg1, $arg2, $arg3)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_check_action") {
printf("%d:memory_mallopt_check_action new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_perturb") {
printf("%d:memory_mallopt_perturb new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_test") {
printf("%d:memory_mallopt_arena_test new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

probe process("/lib*/libc.so.*").mark("memory_mallopt_arena_max") {
printf("%d:memory_mallopt_arena_max new %d old %dn",
gettimeofday_ms(), $arg1, $arg2)
}

setcontext and signal handlers

setcontext is a C library function that along with getcontext allows you to perform a non-local jump from one context to another. They are often used when implementing coroutines or custom threading libraries. longjmp and setjmp provide similar functionality but setcontext was an attempt to fix the shortcomings of these functions and standardize behaviour, although in POSIX 2008 the specification of setcontext and related functions were removed due to the difficulty of implementing them in a truly portable manner.

In the beginning there was setjmp and longjmp. setjmp would capture the current register values into a data structure and longjmp would restore those values at a later point in program execution, causing the control flow to jump back to the point where setjmp was called. This works fine unless the place where you are jumping from is a signal handler. In this case the problem you have is that setjmp will not restore the signal mask so the signal you were handling will not be unmasked. To fix this functions that saved and restored the signal mask called sigsetjmp and siglongjmp were introduced.

However, this doesn’t mean it is necessarily safe to jump out of a signal handler even if you are using siglongjmp. The problem you will often hit is that if the signal was delivered at an arbitrary point in program execution there may be locks or other global resource that need to be deallocated. The only way to avoid this is to block the appropriate signal across any part of the code that may not behave well when interrupted in this way. Unfortunately without auditing all third party libraries that probably means you can only enable handling of such a signal across very small regions of your program.

setcontext and getcontext also restore and save the signal mask and setcontext can also be used to exit from a signal handler with the caveats expressed above. However there is another way in which signal handling and setcontext interact. The context structure used by setcontext and getcontext is of type ucontext_t. If you installed your signal handler using sigaction and the sa_handler field of struct sigaction you get a third argument to the signal handler which is of type void * but allows casting to ucontext_t *. So what happens if you pass this structure to setcontext?

Well the answer is, on Linux at least, “probably nothing good”. The original definition of setcontext specified that “program execution continues with the program instruction following the instruction interrupted by the signal”, but more recent versions of the standards removed that requirement so the result is now unspecified. Some glibc ports such as powerpc, mips and tile do support restoring signal handler created contexts in the spirit of the original specification, but the rest, including x86 and ARM do not. As such it is not possible to rely on being able to restore a signal handler created context with setcontext on Linux. It would be interesting to know if any proprietary Unixes support restoring these contexts and if any applications actually use the functionality.

Debugging and profiling libtool binaries

libtool takes care of a lot of the details of building binaries and shared libraries for you, but one of the side effects of this is that until you install your binaries the binaries in your build directory are actually shell script wrappers. Running gdb or profiling tools on the shell script won’t work:

# gdb ./mybinary 
GNU gdb (GDB) Fedora 7.5.1-42.fc18
Copyright (C) 2012 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
...
"/home/will/mybinary": not in executable format: File format not recognized
(gdb)

However, you can get libtool to help you:

# libtool --mode=execute gdb ./mybinary
GNU gdb (GDB) Fedora 7.5.1-42.fc18
Copyright (C) 2012 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
...
Reading symbols from /home/will/.libs/lt-mybinary...done.
(gdb)

This trick will also work for other tools like gprof and perf.

A simple power law pseudo-random number generator

Today I needed some non-uniform random numbers. Most pseudo random number generators such as the rand(3) function return numbers that are drawn from a uniform distribution, so for example with rand(3) you have an approximately equal chance of getting and number between 0 and RAND_MAX. However in many real world situations some numbers occur much more frequently than others. For example, on average most programs make many more small memory allocations than large ones. Graphing the distribution of allocation sizes you get something that looks like a power law distribution. I don’t know if allocations always follow this type of pattern in all applications or whether it is in fact a true power law distribution, but it seems like a reasonable approximation.

So how can we generate random numbers that are drawn from a power law distribution? The easiest way is to use the standard uniform random number generator and transform the results. Below is some code in C that does that. It looks quite complex but gcc handles the constant propagation and the generated code is quite simple with only a single call to drand48.

#include <math.h>
#include <stdlib.h>

#define MIN_ALLOCATION_SIZE	4
#define MAX_ALLOCATION_SIZE	8192

size_t
get_block_size (void)
{
  /* Inverse square.  */
  double exponent = -2;
  /* Minimum value of distribution.  */
  double dist_min = MIN_ALLOCATION_SIZE;
  /* Maximum value of distribution.  */
  double dist_max = MAX_ALLOCATION_SIZE;

  double min_pow = pow (dist_min, exponent + 1);
  double max_pow = pow (dist_max, exponent + 1);

  return (size_t) pow((max_pow - min_pow) * drand48 ()
                      + min_pow, 1 / (exponent + 1));
}

How not to do indirect calls on ARM

The Thumb2 instruction encoding can reduce your code size by up to 30%, so it is nice if you’re writing assembler code to use the unified syntax which will allow your code to be encoded as either ARM or Thumb2 seamlessly. Or almost seamlessly – even though the unified syntax is the same for ARM and Thumb2 encodings, there are still some things you have to bear in mind when writing code for Thumb2.

For example, the following code implements an indirect call:

       mov     lr, pc         @ Save return address in lr
       ldr     pc, [sp], #8   @ Load function address

This code will not work correctly on Thumb2. Switching mode from ARM to Thumb happens when an address with the bottom bit set is jumped to, this can be by a branch, an ldr or a mov. However the pc does not store the Thumb mode bit, so moving from the pc as we do in the first instruction above will not capture the information that this is a return to a Thumb function. A better sequence would be:

       ldr     lr, [sp], #8   @ Load function address
       blx     lr             @ Call function

The blx instruction is only available on ARMv5 and above however, so if you need portability to historical ARM cores you may need to do a bit more work.

Debugging check unit tests with gdb

check is a unit testing framework for C code that provides a number of neat features, including running tests after a fork in order to catch any fatal signals that may be caused by a broken test. Unfortunately this can have the side effect of making debugging a failing test with gdb more painful than it needs to be.

There’s a simple way to get around this however – set the CK_FORK environment variable to disable the fork before running tests:

$ gdb my_check_test 
GNU gdb (GDB) Fedora (7.5.1-38.fc18)
Copyright (C) 2012 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
...
Reading symbols from /tmp/my_check_test...done.
(gdb) set environment CK_FORK=no
(gdb) run

Simple lock-free ring buffer implementation

Recently I was looking for some code to pass buffers between two threads in a memory allocator benchmark. The code had to be low overhead enough to ensure that it wouldn’t obscure the performance characteristics of the code being benchmarked, so I was looking for a data structure that was lock-free and reasonably simple and fast. It also had to avoid calling the allocator at any point.

The code below is what I came up with – it’s a blocking, single producer, single consumer lock-free ring buffer FIFO queue.

/*
 * Copyright (c) 2013, Linaro Limited
 *
 * Licensed under the 3 clause BSD license.
 */

#ifndef __LFQ_H__
#define __LFQ_H__

#include 
#include 

struct lfq {
  volatile unsigned int producer;
  volatile unsigned int consumer;
  size_t queue_size;
  void **data;
};

/* Initialize a struct lfq with a size and array of memory. */
static inline void lfq_init(struct lfq *q, size_t queue_size,
			    void **data)
{
  q->queue_size = queue_size;
  q->data = data;
  q->producer = 0;
  q->consumer = 0;
}

/* Internal function to backoff on contention. */
static inline void __lfq_backoff(void)
{
  sched_yield();
}

/* Dequeue an item from the ring. Spin waiting if it is empty. */
static inline void *lfq_dequeue(struct lfq *q)
{
  void *ret;
  while (q->producer == q->consumer)
    __lfq_backoff();
  ret = q->data[q->consumer % q->queue_size];
  q->consumer++;
  return ret;
}

/* Enqueue an item onto the ring. Spin waiting if it is full. */
static inline void lfq_queue(struct lfq *q, void *item)
{
  while (q->producer - q->consumer >= q->queue_size)
    __lfq_backoff();
  q->data[q->producer % q->queue_size] = item;
  __sync_synchronize();
  q->producer++;
}

/* Test is the queue is empty. */
static inline bool lfq_empty(struct lfq *q)
{
  return q->producer == q->consumer;
}
#endif