关于fuse用户态文件系统的文章有很多,比如http://my.debugman.net/program/fuse-180.html,就写得很全面。但关于fuse用户态、内核态通信的文章还比较少,我现在发现的一篇是http://blog.chinaunix.net/uid-20687780-id-313603.html,主要讲解了用户态、内核态的通信协议。
这里主要分析一下fuse的内核态用户态通信机制。fuse的主要运行流程如下图所示:
当用户态程序执行了POSIX的文件系统操作,经过glibc,变换为系统调用传递给vfs,vfs再将其传给FUSE的内核模块,FUSE的内核模块根据系统调用的类型,将请求发送到用户态的FUSE进程,并等待用户态进程的应答。FUSE内核模块在收到应答后,将其发送给vfs,把最终运行结果呈现给用户态程序。
那FUSE是如何让用户态与内核态通信的呢?这个在源代码中可以看得比较清楚。
首先,在内核代码fs/fuse/dev.c中,
通过调用fuse_dev_init函数,将会为fuse生成一个misc设备(类似字符设备,但主设备号为10,并且会在/dev/目录下根据设备名自动生成设备文件),即/dev/fuse。用户态代码通过open打开这个设备文件,并且通过如下结构注册与fuse内核态通信的函数:
/* Misc device describing /dev/fuse.  Misc devices share major number 10;
 * the device node is created automatically from .name. */
static struct miscdevice fuse_miscdevice = {
.minor = FUSE_MINOR,
.name = "fuse", /* the resulting misc device appears as /dev/fuse */
.fops = &fuse_dev_operations,
};
/*
 * Module initialization for the fuse device: create the slab cache
 * used for struct fuse_req allocations, then register the /dev/fuse
 * misc device (misc devices share major number 10).
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or
 * the error from misc_register() (in which case the cache is torn
 * down again).
 */
int __init fuse_dev_init(void)
{
int err;

fuse_req_cachep = kmem_cache_create("fuse_request",
sizeof(struct fuse_req),
0, 0, NULL);
if (!fuse_req_cachep)
return -ENOMEM;

err = misc_register(&fuse_miscdevice);
if (err) {
/* undo the cache creation if device registration failed */
kmem_cache_destroy(fuse_req_cachep);
return err;
}

return 0;
}
struct fuse_chan *fuse_kern_chan_new(int fd)
{
    struct fuse_chan_ops op = {
        .receive = fuse_kern_chan_receive,
        .send = fuse_kern_chan_send,
        .destroy = fuse_kern_chan_destroy,
    };
    size_t bufsize = getpagesize() + 0x1000;
    bufsize = bufsize < MIN_BUFSIZE ? MIN_BUFSIZE : bufsize;
    return fuse_chan_new(&op, fd, bufsize, NULL);
}
fuse_kern_chan_receive函数,通过res = read(fuse_chan_fd(ch), buf, size);从/dev/fuse中读取内核发来的请求,再通过fuse_kern_chan_send函数中的ssize_t res = writev(fuse_chan_fd(ch), iov, count);将数据发送到内核模块。
再回到内核模块,还是fs/fuse/dev.c文件中,FUSE通过为/dev/fuse设备文件注册以下操作回调来支持用户态对它的读写操作:
其中,do_sync_read中,调用了ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos),同样do_sync_write函数中,也调用了ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos),所以它们不用单独实现。
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.llseek = no_llseek, /* 不支持seek操作 */
.read = do_sync_read, /* 使用通用的同步读函数 */
.aio_read = fuse_dev_read, /* fuse为用户态读取提供的异步函数 */
.write = do_sync_write, /* 使用通用的同步写函数 */
.aio_write = fuse_dev_write, /* fuse为用户态写入提供的异步函数 */
.poll = fuse_dev_poll, /* 检查是否在一个文件上有操作发生,如果没有则睡眠,直到该文件上有操作发生 */
.release = fuse_dev_release, /* 用户态close该设备文件对应的fd */
.fasync = fuse_dev_fasync, /* 通过信号来启用或禁止I/O事件通告 */
};
在FUSE内核中,存在一个fuse_conn结构体,为用户态、内核态通信服务,其结构为:
/**
 * A Fuse connection.
 *
 * This structure is created, when the filesystem is mounted, and is
 * destroyed, when the client device is closed and the filesystem is
 * unmounted.
 */
struct fuse_conn {
/** Lock protecting accesses to members of this structure */
spinlock_t lock;
/** Mutex protecting against directory alias creation */
struct mutex inst_mutex;
/** Refcount of this structure */
atomic_t count;
/** The user id for this mount */
uid_t user_id;
/** The group id for this mount */
gid_t group_id;
/** The fuse mount flags for this mount */
unsigned flags;
/** Maximum read size, in bytes */
unsigned max_read;
/** Maximum write size, in bytes */
unsigned max_write;
/** Readers of the connection are waiting on this (wait queue for read requests) */
wait_queue_head_t waitq;
/** The list of pending (not yet sent to userspace) requests */
struct list_head pending;
/** The list of requests being processed (sent, awaiting a reply) */
struct list_head processing;
/** The list of requests under I/O */
struct list_head io;
/** The next unique kernel file handle */
u64 khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
/** Maximum number of outstanding background requests */
unsigned max_background;
/** Number of background requests at which congestion starts */
unsigned congestion_threshold;
/** Number of requests currently in the background */
unsigned num_background;
/** Number of background requests currently queued for userspace */
unsigned active_background;
/** The list of background requests set aside for later queuing */
struct list_head bg_queue;
/** Pending interrupts (queue of interrupt requests) */
struct list_head interrupts;
/** Flag indicating if connection is blocked. This will be
the case before the INIT reply is received, and if there
are too many outstanding background requests */
int blocked;
/** waitq for blocked connection */
wait_queue_head_t blocked_waitq;
/** waitq for reserved requests */
wait_queue_head_t reserved_req_waitq;
/** The next unique request id */
u64 reqctr;
/** Connection established, cleared on umount, connection
abort and device release */
unsigned connected;
/** Connection failed (version mismatch). Cannot race with
setting other bitfields since it is only set once in INIT
reply, before any other request, and never cleared */
unsigned conn_error:1;
/** Connection successful. Only set in INIT */
unsigned conn_init:1;
/** Do readpages asynchronously? Only set in INIT */
unsigned async_read:1;
/** Do not send separate SETATTR request before open(O_TRUNC) */
unsigned atomic_o_trunc:1;
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
/** Set if bdi is valid */
unsigned bdi_initialized:1;
/*
 * The following bitfields are only for optimization purposes
 * and hence races in setting them will not cause malfunction
 */
/** Is fsync not implemented by fs? */
unsigned no_fsync:1;
/** Is fsyncdir not implemented by fs? */
unsigned no_fsyncdir:1;
/** Is flush not implemented by fs? */
unsigned no_flush:1;
/** Is setxattr not implemented by fs? */
unsigned no_setxattr:1;
/** Is getxattr not implemented by fs? */
unsigned no_getxattr:1;
/** Is listxattr not implemented by fs? */
unsigned no_listxattr:1;
/** Is removexattr not implemented by fs? */
unsigned no_removexattr:1;
/** Are file locking primitives not implemented by fs? */
unsigned no_lock:1;
/** Is access not implemented by fs? */
unsigned no_access:1;
/** Is create not implemented by fs? */
unsigned no_create:1;
/** Is interrupt not implemented by fs? */
unsigned no_interrupt:1;
/** Is bmap not implemented by fs? */
unsigned no_bmap:1;
/** Is poll not implemented by fs? */
unsigned no_poll:1;
/** Do multi-page cached writes */
unsigned big_writes:1;
/** Don't apply umask to creation modes */
unsigned dont_mask:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
/** Negotiated minor version */
unsigned minor;
/** Backing dev info */
struct backing_dev_info bdi;
/** Entry on the fuse_conn_list */
struct list_head entry;
/** Device ID from super block */
dev_t dev;
/** Dentries in the control filesystem */
struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
/** number of dentries used in the above array */
int ctl_ndents;
/** O_ASYNC requests */
struct fasync_struct *fasync;
/** Key for lock owner ID scrambling */
u32 scramble_key[4];
/** Reserved request for the DESTROY message */
struct fuse_req *destroy_req;
/** Version counter for attribute changes */
u64 attr_version;
/** Called on final put */
void (*release)(struct fuse_conn *);
/** Super block for this connection. */
struct super_block *sb;
/** Read/write semaphore to hold when accessing sb. */
struct rw_semaphore killsb;
};
fuse_conn结构体的指针将会保存在file->private_data中,每次内核态向用户态发送请求时都会用到fuse_conn结构体。fuse_dev_read函数的主要处理流程如下:
/*
 * Called when the userspace FUSE daemon read()s /dev/fuse: wait for a
 * pending request, move it to the io list and copy it out to the
 * daemon's buffer.  (Article excerpt -- parts of the original
 * function are elided with "......".)
 */
static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct fuse_in *in; /* the kernel->user input part of the request being read */
// variable definitions omitted
struct fuse_conn *fc = fuse_get_conn(file); /* fetch the fuse_conn pointer (kept in file->private_data) */
if (!fc)
return -EPERM;
restart:
spin_lock(&fc->lock);
err = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) && fc->connected &&
!request_pending(fc)) // non-blocking open: return -EAGAIN immediately when no request is pending
goto err_unlock;
request_wait(fc); // otherwise block until a request from the kernel side arrives
......
if (!list_empty(&fc->interrupts)) { // interrupt requests, if any are queued, are sent first
req = list_entry(fc->interrupts.next, struct fuse_req,
intr_entry);
return fuse_read_interrupt(fc, req, iov, nr_segs);
}
req = list_entry(fc->pending.next, struct fuse_req, list); // take the next request to be sent from the pending list
req->state = FUSE_REQ_READING;
list_move(&req->list, &fc->io); // move the request onto the list of requests under I/O
in = &req->in;
reqsize = in->h.len;
/* If request is too large, reply with an error and restart the read */
........
spin_unlock(&fc->lock);
fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); // prepare the copy of the request to userspace
err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); // copy the request header to userspace
if (!err)
err = fuse_copy_args(&cs, in->numargs, in->argpages,
(struct fuse_arg *) in->args, 0); // copy the request body; loops over all arguments when there are several
fuse_copy_finish(&cs); // finish the copy and release its resources
spin_lock(&fc->lock);
req->locked = 0;
// error checking of the send path omitted
....
if (!req->isreply) // no reply expected: the request ends here
request_end(fc, req);
else {
req->state = FUSE_REQ_SENT; // a reply from userspace is expected,
list_move_tail(&req->list, &fc->processing); // so move the request to the processing list, completed later via fuse_dev_write
if (req->interrupted)
queue_interrupt(fc, req);
spin_unlock(&fc->lock);
}
return reqsize;
err_unlock:
spin_unlock(&fc->lock);
return err;
}
其中fuse_in结构体如下所示:
/** The request input (the kernel->userspace half of a request) */
struct fuse_in {
/** The request header */
struct fuse_in_header h;
/** True if the data for the last argument is in req->pages */
unsigned argpages:1;
/** Number of arguments carried by this request */
unsigned numargs;
/** Array of arguments */
struct fuse_in_arg args[3];
};
此结构体中包含的另外两个结构体如下:
struct fuse_in_header {
__u32 len; // total length of the packet, header included
__u32 opcode; // operation code identifying the type of request
__u64 unique; // unique number of this request
__u64 nodeid; // id of the file node being operated on, similar to an inode number
__u32 uid;
__u32 gid;
__u32 pid;
__u32 padding; // unused; presumably pads the header to a 64-bit boundary
};
/** One input argument of a request */
struct fuse_in_arg {
unsigned size; // length of the argument, in bytes
const void *value; // pointer to the argument data
};