Analysis of the FUSE User-Space/Kernel-Space Communication Mechanism

Date: 2021-11-06 08:51:59

There are already plenty of articles about the FUSE user-space filesystem; http://my.debugman.net/program/fuse-180.html, for example, covers it quite comprehensively. Articles on how FUSE's user-space and kernel-space halves communicate are scarcer. The one I have found so far is http://blog.chinaunix.net/uid-20687780-id-313603.html, which mainly explains the communication protocol between user space and kernel space.

This article analyzes FUSE's kernel/user-space communication mechanism. FUSE's overall flow is shown in the figure below:

[Figure: FUSE request flow: user program → glibc → VFS → FUSE kernel module → user-space FUSE process, and back]

When a user program performs a POSIX filesystem operation, glibc turns it into a system call and hands it to the VFS, which passes it on to the FUSE kernel module. Based on the type of the system call, the FUSE kernel module sends a request to the user-space FUSE process and waits for that process to reply. Once the reply arrives, the FUSE kernel module hands it back to the VFS, and the final result is returned to the user program.

So how does FUSE let user space and kernel space talk to each other? The source code shows this quite clearly.

First, in the kernel source file fs/fuse/dev.c:

/* Define a misc device for FUSE */
static struct miscdevice fuse_miscdevice = {
    .minor = FUSE_MINOR,
    .name  = "fuse",                  /* the resulting misc device will appear at /dev/fuse */
    .fops = &fuse_dev_operations,
};

int __init fuse_dev_init(void)
{
    int err = -ENOMEM;
    fuse_req_cachep = kmem_cache_create("fuse_request",
                        sizeof(struct fuse_req),
                        0, 0, NULL);
    if (!fuse_req_cachep)
        goto out;

    err = misc_register(&fuse_miscdevice);   /* register as a misc device; misc devices use major number 10 */
    if (err)
        goto out_cache_clean;

    return 0;

 out_cache_clean:
    kmem_cache_destroy(fuse_req_cachep);
 out:
    return err;
}
Calling fuse_dev_init creates a misc device at /dev/fuse (misc devices are like character devices, but share major number 10, and a device file named after the device is created automatically under /dev/). User-space code then open()s this device file and registers the functions used to talk to the FUSE kernel module via the following function:
struct fuse_chan *fuse_kern_chan_new(int fd)
{
    struct fuse_chan_ops op = {
        .receive = fuse_kern_chan_receive,
        .send = fuse_kern_chan_send,
        .destroy = fuse_kern_chan_destroy,
    };
    size_t bufsize = getpagesize() + 0x1000;
    bufsize = bufsize < MIN_BUFSIZE ? MIN_BUFSIZE : bufsize;
    return fuse_chan_new(&op, fd, bufsize, NULL);
}
fuse_kern_chan_receive reads requests sent by the kernel from /dev/fuse via res = read(fuse_chan_fd(ch), buf, size);, and fuse_kern_chan_send sends data back to the kernel module via ssize_t res = writev(fuse_chan_fd(ch), iov, count);.
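Putting the two halves together: over /dev/fuse, each read() returns exactly one kernel request, and each writev() must deliver one complete reply whose fuse_out_header echoes the request's unique id. The sketch below shows the shape of such a loop; fuse_loop_sketch is a hypothetical name, the fd is assumed to have been obtained during mount, and a real daemon would size its buffer from the negotiated limits and handle FUSE_INIT first instead of failing every opcode:

#include <errno.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/fuse.h>   /* struct fuse_in_header / struct fuse_out_header */

/* Hypothetical sketch of a FUSE daemon's event loop. 'fd' is the
 * descriptor for /dev/fuse obtained while mounting the filesystem. */
static void fuse_loop_sketch(int fd)
{
    char buf[65536];              /* must be big enough to hold one whole request */

    for (;;) {
        /* one read() returns exactly one kernel request */
        ssize_t n = read(fd, buf, sizeof(buf));
        if (n < (ssize_t) sizeof(struct fuse_in_header))
            break;

        struct fuse_in_header *in = (struct fuse_in_header *) buf;

        /* a reply is a fuse_out_header followed by the opcode-specific
         * body; this sketch just fails every request with -ENOSYS */
        struct fuse_out_header out = {
            .len    = sizeof(out),
            .error  = -ENOSYS,
            .unique = in->unique, /* must echo the request's unique id */
        };
        struct iovec iov = { .iov_base = &out, .iov_len = sizeof(out) };
        writev(fd, &iov, 1);      /* one writev() delivers one complete reply */
    }
}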

Back in the kernel module, still in fs/fuse/dev.c: FUSE registers the following operation callbacks for the /dev/fuse device file, which is what lets user space read from and write to it:

const struct file_operations fuse_dev_operations = {
    .owner = THIS_MODULE,
    .llseek = no_llseek,           /* seeking is not supported */
    .read = do_sync_read,          /* generic synchronous read helper */
    .aio_read = fuse_dev_read,     /* FUSE's asynchronous read routine for user space */
    .write = do_sync_write,        /* generic synchronous write helper */
    .aio_write = fuse_dev_write,   /* FUSE's asynchronous write routine for user space */
    .poll = fuse_dev_poll,         /* check whether there is activity on the file; if not, sleep until there is */
    .release = fuse_dev_release,   /* called when user space close()s the fd for this device file */
    .fasync = fuse_dev_fasync,     /* enable/disable I/O event notification via signals */
};
Inside do_sync_read the kernel calls ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos), and do_sync_write likewise calls ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos), so FUSE does not have to implement the synchronous entry points separately.
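For reference, here is roughly what do_sync_read looked like in kernels of this era (a simplified sketch of fs/read_write.c; the -EIOCBRETRY retry loop and error checks are trimmed):

/* Simplified sketch of do_sync_read: wrap the user buffer in an iovec,
 * build a synchronous kiocb, and delegate to the driver's ->aio_read,
 * waiting for completion if the request gets queued. */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);        /* synchronous kiocb: no AIO context */
    kiocb.ki_pos = *ppos;
    kiocb.ki_left = len;

    ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
    if (ret == -EIOCBQUEUED)
        ret = wait_on_sync_kiocb(&kiocb); /* block until the async path completes */
    *ppos = kiocb.ki_pos;
    return ret;
}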

The FUSE kernel module keeps a fuse_conn structure that serves user/kernel communication. Its definition:

/**
* A Fuse connection.
*
* This structure is created, when the filesystem is mounted, and is
* destroyed, when the client device is closed and the filesystem is
* unmounted.
*/
struct fuse_conn {
	/** Lock protecting accesses to members of this structure */
	spinlock_t lock;

	/** Mutex protecting against directory alias creation */
	struct mutex inst_mutex;

	/** Refcount of this structure */
	atomic_t count;

	/** The user id for this mount */
	uid_t user_id;

	/** The group id for this mount */
	gid_t group_id;

	/** The fuse mount flags for this mount */
	unsigned flags;

	/** Maximum read size */
	unsigned max_read;

	/** Maximum write size */
	unsigned max_write;

	/** Readers of the connection are waiting on this */
	wait_queue_head_t waitq;

	/** The list of pending requests */
	struct list_head pending;

	/** The list of requests being processed */
	struct list_head processing;

	/** The list of requests under I/O */
	struct list_head io;

	/** The next unique kernel file handle */
	u64 khctr;

	/** rbtree of fuse_files waiting for poll events indexed by ph */
	struct rb_root polled_files;

	/** Maximum number of outstanding background requests */
	unsigned max_background;

	/** Number of background requests at which congestion starts */
	unsigned congestion_threshold;

	/** Number of requests currently in the background */
	unsigned num_background;

	/** Number of background requests currently queued for userspace */
	unsigned active_background;

	/** The list of background requests set aside for later queuing */
	struct list_head bg_queue;

	/** Pending interrupts */
	struct list_head interrupts;

	/** Flag indicating if connection is blocked. This will be
	    the case before the INIT reply is received, and if there
	    are too many outstanding background requests */
	int blocked;

	/** waitq for blocked connection */
	wait_queue_head_t blocked_waitq;

	/** waitq for reserved requests */
	wait_queue_head_t reserved_req_waitq;

	/** The next unique request id */
	u64 reqctr;

	/** Connection established, cleared on umount, connection
	    abort and device release */
	unsigned connected;

	/** Connection failed (version mismatch). Cannot race with
	    setting other bitfields since it is only set once in INIT
	    reply, before any other request, and never cleared */
	unsigned conn_error:1;

	/** Connection successful. Only set in INIT */
	unsigned conn_init:1;

	/** Do readpages asynchronously? Only set in INIT */
	unsigned async_read:1;

	/** Do not send separate SETATTR request before open(O_TRUNC) */
	unsigned atomic_o_trunc:1;

	/** Filesystem supports NFS exporting. Only set in INIT */
	unsigned export_support:1;

	/** Set if bdi is valid */
	unsigned bdi_initialized:1;

	/*
	 * The following bitfields are only for optimization purposes
	 * and hence races in setting them will not cause malfunction
	 */

	/** Is fsync not implemented by fs? */
	unsigned no_fsync:1;

	/** Is fsyncdir not implemented by fs? */
	unsigned no_fsyncdir:1;

	/** Is flush not implemented by fs? */
	unsigned no_flush:1;

	/** Is setxattr not implemented by fs? */
	unsigned no_setxattr:1;

	/** Is getxattr not implemented by fs? */
	unsigned no_getxattr:1;

	/** Is listxattr not implemented by fs? */
	unsigned no_listxattr:1;

	/** Is removexattr not implemented by fs? */
	unsigned no_removexattr:1;

	/** Are file locking primitives not implemented by fs? */
	unsigned no_lock:1;

	/** Is access not implemented by fs? */
	unsigned no_access:1;

	/** Is create not implemented by fs? */
	unsigned no_create:1;

	/** Is interrupt not implemented by fs? */
	unsigned no_interrupt:1;

	/** Is bmap not implemented by fs? */
	unsigned no_bmap:1;

	/** Is poll not implemented by fs? */
	unsigned no_poll:1;

	/** Do multi-page cached writes */
	unsigned big_writes:1;

	/** Don't apply umask to creation modes */
	unsigned dont_mask:1;

	/** The number of requests waiting for completion */
	atomic_t num_waiting;

	/** Negotiated minor version */
	unsigned minor;

	/** Backing dev info */
	struct backing_dev_info bdi;

	/** Entry on the fuse_conn_list */
	struct list_head entry;

	/** Device ID from super block */
	dev_t dev;

	/** Dentries in the control filesystem */
	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];

	/** number of dentries used in the above array */
	int ctl_ndents;

	/** O_ASYNC requests */
	struct fasync_struct *fasync;

	/** Key for lock owner ID scrambling */
	u32 scramble_key[4];

	/** Reserved request for the DESTROY message */
	struct fuse_req *destroy_req;

	/** Version counter for attribute changes */
	u64 attr_version;

	/** Called on final put */
	void (*release)(struct fuse_conn *);

	/** Super block for this connection. */
	struct super_block *sb;

	/** Read/write semaphore to hold when accessing sb. */
	struct rw_semaphore killsb;
};


A pointer to the fuse_conn structure is saved in file->private_data, and every request the kernel sends to user space goes through it. fuse_dev_read fetches this pointer first, via fuse_get_conn.
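fuse_get_conn itself is a one-liner; this sketch is modelled on the same fs/fuse/dev.c:

static struct fuse_conn *fuse_get_conn(struct file *file)
{
    /* Lockless access is fine: file->private_data is set once when the
     * device is opened during mount and stays valid until release. */
    return file->private_data;
}

With the connection in hand, the main flow of fuse_dev_read is as follows: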

static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
			     unsigned long nr_segs, loff_t pos)
{
	struct fuse_in *in;	/* the request input that user space will read */
	/* other variable declarations omitted */
	struct fuse_conn *fc = fuse_get_conn(file);	/* get the fuse_conn pointer */
	if (!fc)
		return -EPERM;

 restart:
	spin_lock(&fc->lock);
	err = -EAGAIN;
	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
	    !request_pending(fc))	/* non-blocking mode: if no request is queued, return immediately */
		goto err_unlock;

	request_wait(fc);	/* block until a request arrives */
	......
	if (!list_empty(&fc->interrupts)) {	/* interrupt requests, if any, are sent first */
		req = list_entry(fc->interrupts.next, struct fuse_req,
				 intr_entry);
		return fuse_read_interrupt(fc, req, iov, nr_segs);
	}

	req = list_entry(fc->pending.next, struct fuse_req, list);	/* take the next request to send off the pending list */
	req->state = FUSE_REQ_READING;
	list_move(&req->list, &fc->io);	/* move the request to the I/O-in-progress list */

	in = &req->in;
	reqsize = in->h.len;
	/* If request is too large, reply with an error and restart the read */
	........

	spin_unlock(&fc->lock);
	fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);	/* prepare to copy the request to user space */
	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));	/* copy the request header to user space */
	if (!err)
		err = fuse_copy_args(&cs, in->numargs, in->argpages,
				     (struct fuse_arg *) in->args, 0);	/* copy the request body; loop over all arguments */
	fuse_copy_finish(&cs);	/* finish the copy and release memory */
	spin_lock(&fc->lock);
	req->locked = 0;
	/* error handling for the send path omitted */
	....
	if (!req->isreply)	/* no reply expected: the request ends here */
		request_end(fc, req);
	else {
		req->state = FUSE_REQ_SENT;	/* a reply from user space is expected, */
		list_move_tail(&req->list, &fc->processing);	/* so move the request to the processing list for fuse_dev_write to handle */
		if (req->interrupted)
			queue_interrupt(fc, req);
		spin_unlock(&fc->lock);
	}
	return reqsize;

 err_unlock:
	spin_unlock(&fc->lock);
	return err;
}
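The request_wait() call above is where the daemon's read blocks while nothing is queued. Here is a sketch modelled on the same era's fs/fuse/dev.c; note that it is entered and left with fc->lock held, dropping the lock only around schedule():

/* Sleep on fc->waitq until a pending or interrupt request arrives, the
 * connection dies, or a signal is delivered. Callers hold fc->lock. */
static void request_wait(struct fuse_conn *fc)
{
    DECLARE_WAITQUEUE(wait, current);

    add_wait_queue_exclusive(&fc->waitq, &wait);
    while (fc->connected && !request_pending(fc)) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (signal_pending(current))
            break;                /* a signal interrupts the wait */

        spin_unlock(&fc->lock);
        schedule();               /* sleep until a new request wakes us */
        spin_lock(&fc->lock);
    }
    set_current_state(TASK_RUNNING);
    remove_wait_queue(&fc->waitq, &wait);
}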


The fuse_in structure referenced above looks like this:

/** The request input */
struct fuse_in {
	/** The request header */
	struct fuse_in_header h;

	/** True if the data for the last argument is in req->pages */
	unsigned argpages:1;

	/** Number of arguments carried by this request */
	unsigned numargs;

	/** Array of arguments */
	struct fuse_in_arg args[3];
};

This structure in turn contains two other structures:

struct fuse_in_header {
	__u32 len;	/* total length of the packet */
	__u32 opcode;	/* operation code, identifying the request type */
	__u64 unique;	/* unique id of this packet */
	__u64 nodeid;	/* id of the file node being operated on, similar to an inode number */
	__u32 uid;
	__u32 gid;
	__u32 pid;
	__u32 padding;	/* alignment padding only */
};

/** One input argument of a request */
struct fuse_in_arg {
	unsigned size;	/* length of the argument */
	const void *value;	/* pointer to the argument data */
};
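To see how a request fills these structures in, here is a sketch modelled on fuse_lookup_init() in fs/fuse/dir.c: a FUSE_LOOKUP request carries the name being looked up as its single input argument, and fuse_dev_read later copies req->in.h followed by each of the numargs entries of req->in.args to user space:

/* Modelled on fuse_lookup_init(): prepare a LOOKUP request. */
static void lookup_init_sketch(struct fuse_req *req, u64 nodeid,
                               struct qstr *name,
                               struct fuse_entry_out *outarg)
{
    req->in.h.opcode = FUSE_LOOKUP;       /* request type */
    req->in.h.nodeid = nodeid;            /* the directory being searched */
    req->in.numargs = 1;
    req->in.args[0].size = name->len + 1; /* the name plus its trailing NUL */
    req->in.args[0].value = name->name;

    /* the expected reply body is described symmetrically in req->out */
    req->out.numargs = 1;
    req->out.args[0].size = sizeof(struct fuse_entry_out);
    req->out.args[0].value = outarg;
}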