glibc-malloc源码阅读

2022-03-25 约 3387 字预计阅读 7 分钟

摘要: 作者通过分析glibc malloc源码，阐述了malloc的内存管理机制，包括其与mmap的关系、内存池、fastbin、smallbin、unsorted_bin等结构的作用，以及tcache的缓存机制，强调malloc在用户态尽量减少系统调用、减少碎片、按线程管理内存、支持大块和小块内存的分配与合并等特性，揭示了malloc在多线程环境下的高效和复杂的内存调度策略。 (评价: B)

在上一篇中，我们了解到了mmap的一些性质和基本原理：

分配的是虚拟内存
通过红黑树找到目标内存块，并且通过红黑树管理
分配的内存可能比实际需要的大
是阻塞的
会有内存对齐

本篇将了解到malloc和mmap的关系，以及解答我在mmap学习中产生的一个疑问：mmap会不会导致内存碎片？（因为mmap的内存看起来是被内核维护的红黑树管理了，所以我理解不会存在内存碎片。）

概述

以下是malloc.c中关于malloc的一小段概述：

1
2
3
4
5
6
7
8
9


  The main properties of the algorithms are:
  * For large (>= 512 bytes) requests, it is a pure best-fit allocator,
    with ties normally decided via FIFO (i.e. least recently used).
  * For small (<= 64 bytes by default) requests, it is a caching
    allocator, that maintains pools of quickly recycled chunks.
  * In between, and for combinations of large and small requests, it does
    the best it can trying to meet both goals at once.
  * For very large requests (>= 128KB by default), it relies on system
    memory mapping facilities, if supported.

有些地方可能不理解，先看代码吧～

__libc_malloc

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


void *
__libc_malloc (size_t bytes)
{
  mstate ar_ptr;
  void *victim;
  void *(*hook) (size_t, const void *)
    = atomic_forced_read (__malloc_hook);
  if (__builtin_expect (hook != NULL, 0))
    return (*hook)(bytes, RETURN_ADDRESS (0));
#if USE_TCACHE
  /* int_free also calls request2size, be careful to not pad twice.  */
  size_t tbytes;
  checked_request2size (bytes, tbytes);
  size_t tc_idx = csize2tidx (tbytes);
  MAYBE_INIT_TCACHE ();
  DIAG_PUSH_NEEDS_COMMENT;
  if (tc_idx < mp_.tcache_bins
      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
      && tcache
      && tcache->entries[tc_idx] != NULL)
    {
      return tcache_get (tc_idx);
    }
  DIAG_POP_NEEDS_COMMENT;
#endif
  if (SINGLE_THREAD_P)
    {
      victim = _int_malloc (&main_arena, bytes);
      assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
              &main_arena == arena_for_chunk (mem2chunk (victim)));
      return victim;
    }
  arena_get (ar_ptr, bytes);
  victim = _int_malloc (ar_ptr, bytes);
  /* Retry with another arena only if we were able to find a usable arena
     before.  */
  if (!victim && ar_ptr != NULL)
    {
      LIBC_PROBE (memory_malloc_retry, 1, bytes);
      ar_ptr = arena_get_retry (ar_ptr, bytes);
      victim = _int_malloc (ar_ptr, bytes);
    }
  if (ar_ptr != NULL)
    __libc_lock_unlock (ar_ptr->mutex);
  assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
          ar_ptr == arena_for_chunk (mem2chunk (victim)));
  return victim;
}

hook

以上第一段是调用hook函数，也就是说malloc默认是提供hook接口的，只要实现了hook接口，malloc就可以变成对应的hook函数。

1
2
3
4


  void *(*hook) (size_t, const void *)
    = atomic_forced_read (__malloc_hook);
  if (__builtin_expect (hook != NULL, 0))
    return (*hook)(bytes, RETURN_ADDRESS (0));

__malloc_hook的默认值是指向的malloc_hook_ini，如下，malloc_hook_ini会给__malloc_hook赋空，所以malloc_hook_ini可以相当于就是__libc_malloc：

1
2
3
4
5
6
7


static void *
malloc_hook_ini (size_t sz, const void *caller)
{
  __malloc_hook = NULL;
  ptmalloc_init ();
  return __libc_malloc (sz);
}

tcache

第二部分是tcache，暂且不用过分追究（因为内容比较多，虽然目前malloc可能依赖于tcache，但是即使不学习tcache也不太影响对malloc的理解）：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


#if USE_TCACHE
  /* int_free also calls request2size, be careful to not pad twice.  */
  size_t tbytes;
  checked_request2size (bytes, tbytes);
  size_t tc_idx = csize2tidx (tbytes);
  MAYBE_INIT_TCACHE ();
  DIAG_PUSH_NEEDS_COMMENT;
  if (tc_idx < mp_.tcache_bins
      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
      && tcache
      && tcache->entries[tc_idx] != NULL)
    {
      return tcache_get (tc_idx);
    }
  DIAG_POP_NEEDS_COMMENT;
#endif

两个宏如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


#define request2size(req)                                         \
  (((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE)  ?             \
   MINSIZE :                                                      \
   ((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)

#define checked_request2size(req, sz) \
({                                    \
  (sz) = request2size (req);            \
  if (((sz) < (req))                    \
      || REQUEST_OUT_OF_RANGE (sz)) \
    {                                    \
      __set_errno (ENOMEM);            \
      return 0;                            \
    }                                    \
})

如上，先是内存对齐，得到tbytes，然后根据tbytes计算出来一个下标tc_idx，然后就可以根据下标去tcache里面找了。

tcache的基本定义如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20


/* We overlay this structure on the user-data portion of a chunk when
   the chunk is stored in the per-thread cache.  */
typedef struct tcache_entry
{
  struct tcache_entry *next;
  /* This field exists to detect double frees.  */
  struct tcache_perthread_struct *key;
} tcache_entry;
/* There is one of these for each thread, which contains the
   per-thread cache (hence "tcache_perthread_struct").  Keeping
   overall size low is mildly important.  Note that COUNTS and ENTRIES
   are redundant (we could have just counted the linked list each
   time), this is for performance reasons.  */
typedef struct tcache_perthread_struct
{
  char counts[TCACHE_MAX_BINS];
  tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;
static __thread bool tcache_shutting_down = false;
static __thread tcache_perthread_struct *tcache = NULL;

可以知道几点：

tcache是每个线程维护的
tcache是一个链表结构

在malloc中，通过tcache_get接口从tcache中获取了一块buffer，实现如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


/* Caller must ensure that we know tc_idx is valid and there's
   available chunks to remove.  */
static __always_inline void *
tcache_get (size_t tc_idx)
{
  tcache_entry *e = tcache->entries[tc_idx];
  assert (tc_idx < TCACHE_MAX_BINS);
  assert (tcache->counts[tc_idx] > 0);
  tcache->entries[tc_idx] = e->next;
  --(tcache->counts[tc_idx]);
  e->key = NULL;
  return (void *) e;
}

大概意思是，拿tcache的第tc_idx个作为输出，然后将原本第tc_idx的entry指向当前需要输出的entry的下一个，这意味着tcache里面不同的entry也可能指向同一个，然后再把对应的第tc_idx的count--。

再看一下和get对应的put操作：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


/* Caller must ensure that we know tc_idx is valid and there's room
   for more chunks.  */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
  assert (tc_idx < TCACHE_MAX_BINS);
  /* Mark this chunk as "in the tcache" so the test in _int_free will
     detect a double free.  */
  e->key = tcache;
  e->next = tcache->entries[tc_idx];
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

恩！可以和get对应起来了，get的时候第tc_idx的entry被指向了下一个，在put的时候就可以记住下一个的位置，然后put的时候将tc_idx指回去就行了，所以链表又恢复到了原状。

这里有个疑问：entry的状态是怎么记录的？比如get的时候发送一个entry出去，那么free的时候怎么记得这个buffer的大小/来源等等之类的呢？可以想到一些类比结构，比如：

task_struct和thread_info的内存关系（这里）
class中type_info和类的内存关系（这里）

实际上，tcache entry的“头上”还有一块内存空间，在那里记录了malloc内存的一些状态信息。在这里可以看到详细的内存分布。

以上，我们大概了解到了malloc以及tcache：

malloc可以从tcache中直接拿到内存
tcache可以用作缓存内存，也就是内存可以不立即交还给系统
tcache相当于是一个内存池，并且由每个线程维护
malloc的内存比实际需要的内存大

以上默认情况是可以从tcache中获取内存，如果不可以或者里面没有内存呢？

_int_malloc

如果上述条件不满足，__libc_malloc后续的主要逻辑就是调用_int_malloc函数，如下：

1
2


arena_get (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);

第一部分

第一部分可能调用sysmalloc，其中会使用mmap或者brk分配内存。只有av为空的时候才可能进入这一段，av什么时候可能为空？在线程第一次调用malloc之前，av可能为空。这时候通过系统调用申请一块比较大的内存，然后再执行某种分配，填充到av，所以此后，部分内存可以直接通过av获取，而不需要频繁执行系统调用了。（sysmalloc不是系统调用）

1
2
3
4
5
6
7
8
9


/* There are no usable arenas.  Fall back to sysmalloc to get a chunk from
    mmap.  */
if (__glibc_unlikely (av == NULL))
{
    void *p = sysmalloc (nb, av);
    if (p != NULL)
    alloc_perturb (p, bytes);
    return p;
}

第二部分

第二部分代码很多，只看外部的一点：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
{
    idx = fastbin_index (nb);
    mfastbinptr *fb = &fastbin (av, idx);
    //...

    void *p = chunk2mem (victim);
    alloc_perturb (p, bytes);
    return p;
    //...
}

只有在申请的内存小于get_max_fast ()的时候才可能进入。类似的，我们可以看到有fastbin这样的结构，和上文中看到的tcache一样，会有专门的表维护fastbin，并且其中储存的是较小的内存。

从fastbin这中拿到内存后，通过chunk2mem转换成用户可用的内存（实际上就是往高字节取）。

第三部分

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


/*
    If a small request, check regular bin.  Since these "smallbins"
    hold one size each, no searching within bins is necessary.
    (For a large request, we need to wait until unsorted chunks are
    processed to find best fit. But for small ones, fits are exact
    anyway, so we can check now, which is faster.)
*/
if (in_smallbin_range (nb))
{
    idx = smallbin_index (nb);
    bin = bin_at (av, idx);
    //......

    void *p = chunk2mem (victim);
    alloc_perturb (p, bytes);
    return p;
    //......
}
/*
    If this is a large request, consolidate fastbins before continuing.
    While it might look excessive to kill all fastbins before
    even seeing if there is space available, this avoids
    fragmentation problems normally associated with fastbins.
    Also, in practice, programs tend to have runs of either small or
    large requests, but less often mixtures, so consolidation is not
    invoked all that often in most programs. And the programs that
    it is called frequently in otherwise tend to fragment.
*/
else
{
    idx = largebin_index (nb);
    if (atomic_load_relaxed (&av->have_fastchunks))
    malloc_consolidate (av);
}

同理，如果fastbin的大小不满足要求，那么就在smallbin里面去找。

如果smallbin还不满足要求，那么认为需要一个large的内存，这时候会先将fastbin中的小内存合并。其目的是减少内存碎片化（brk内存容易产生碎片，mmap内存不产生碎片，一般情况下只有大内存才会使用mmap申请，比如128KB以上）。

合并到哪里呢？看malloc_consolidate大概是会合并到一个叫unsorted_bin的bin里面去。

第四部分

源码

（看不懂了！！！）

遗留问题是（TODO）：

tcache
fastbin
smallbin
unsorted_bin
以上如何管理？
以上什么关系？
bin是怎么初始化的？
bin是怎么扩充的，可以扩充吗？

总结

尽管没全部看懂，不过也了解malloc的一些事情：

malloc可以尽可能减少通过系统调用分配内存
malloc可以尽可能减少内存碎片
malloc在用户态管理内存（区别于mmap的rb_tree）
malloc申请内存可能通过“内存池”直接返回，也有可能会通过系统调用返回，一般取决于需要的内存大小
malloc可能存在内存合并的过程
malloc管理的内存数量以及大小是有限的（通过idx的计算方式可以推断出）
malloc管理的内存是按线程区分的，所以多线程情况下不存在阻塞
malloc分配虚拟内存的时候，实际的会比需求的更多（物理内存也会更多，因为需要有存放状态的内存，这时候是已经缺页中断了）
free回收内存不一定直接交还给系统，可能会交由tcache或者bin管理，这是用户态的