我做的主要是ssd驱动,ssd驱动通过FTL转换成对一个磁盘的操作,也就相当于一个磁盘的块设备驱动。块设备驱动程序所访问的设备以固定大小的块为单位传输数据,并且这些数据可以被随机访问。
注册块设备
和字符设备驱动一样,都必须先到内核注册下,才能操作设备;在头文件<linux/fs.h>中
注册函数: int register_blkdev(unsigned int major, const char *name);
参数:major 是主设备号,name是设备的名称,在/proc/devices中显示。
如果major传递的是0,则由内核来分配一个主设备号给设备,并且返回给调用者。如果返回值为负数,则表示函数注册失败;
注:其实如果major不为0的话就相当于字符设备中的静态设备号申请,如果为0,则相当于动态设备号申请;
注销函数:int unregister_blkdev(unsigned int major, const char *name);
参数一定要和注册函数的参数匹配,否则出错;
gendisk结构体
结构体
结构体如下:
struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). */ int major; /* major number of driver */ int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, mode_t *mode); unsigned int events; /* supported events */ unsigned int async_events; /* async events, subset of all */ /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through * helpers. */ //整个块设备的分区信息都包含在里面,其核心结构是一个struct hd_struct的指针数组,每一项都指向一个描述分区的hd_struct结构 struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0;// 第一个分区的信息,如果没有分区则指向整个设备 const struct block_device_operations *fops;//操作函数指针集合 struct request_queue *queue;//请求队列 void *private_data; int flags; struct device *driverfs_dev; // FIXME: remove struct kobject *slave_dir; struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; };
int minors 该磁盘使用的次设备号的数目:一个驱动器至少使用一个次设备号,如果驱动器是可分区的,则还要为每一个可能的分区分配一个次设备号;
char disk_name[32] 设置磁盘设备的名称,该名字在/proc/partitions和sysfs中显示;
struct request_queue *queue 请求队列,为设备管理I/O请求;
sector_t capacity 以512字节为一个扇区,该驱动器可以包含的扇区数,一般通过set_capacity设置;
set_capacity(dev->gd, nsectors*(hardsect_size/KERNEL_SECTOR_SIZE));KERNEL_SECTOR_SIZE是一个常量,使用该常量进行内核的512字节扇区到实际使用扇区大小的转换。
分区表
//分区列表
struct disk_part_tbl {
struct rcu_head rcu_head;
int len;
struct hd_struct __rcu *last_lookup;
struct hd_struct __rcu *part[];
};
分区结构体
// 分区信息 struct hd_struct { sector_t start_sect;// 当前分区的起始扇区 sector_t nr_sects;// 分区的大小,多少个扇区 sector_t alignment_offset; unsigned int discard_alignment; struct device __dev;// 一个分区对应一个设备 struct kobject *holder_dir; int policy, partno;// partno分区编号 struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif unsigned long stamp; atomic_t in_flight[2]; #ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; #else struct disk_stats dkstats; #endif atomic_t ref; struct rcu_head rcu_head; };
操作函数指针
在gendisk结构体中有个fops成员,这个成员是对块设备的操作函数指针的集合,如果熟悉字符设备的,应该不难理解;这个结构体是block_device_operations(相当于字符设备的file_operations结构体);
struct block_device_operations { int (*open) (struct block_device *, fmode_t); int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); struct module *owner; };这里有很多函数指针,但真正使用到的没有多少个。open/release 一般都会使用到(设备的打开、关闭);media_changed 用于检查驱动器内的介质是否被更换,revalidate_disk 则在介质更换后被调用,让驱动程序作出相应的处理。getgeo函数用于获取设备的几何参数(填充struct hd_geometry);
gendisk操作函数
gendisk结构是一个动态分配的结构,它需要内核的一些特殊处理来进行初始化,驱动程序不能自己动态地分配该结构体,而必须调用:
struct gendisk *alloc_disk(int minors);
参数:minors是该磁盘使用次设备号的数目,以后不能再修改的;
卸载磁盘:
void del_gendisk(struct gendisk *gd);
gendisk是一个引用计数结构,get_disk和put_disk函数负责处理引用计数,但驱动程序通常不需要直接调用这两个函数;
分配了一个gendisk结构并不能使磁盘对系统可用,必须初始化结构体并调用add_disk:
void add_disk(struct gendisk *gd);
一旦调用add_disk,磁盘设备将被“激活”,它所提供的方法随时可能被调用。事实上,第一次调用这些方法时,add_disk函数可能还没有返回,因为内核会读取磁盘最前面的几个块来查找分区表;所以在驱动程序完全初始化并能够响应请求之前,不要调用add_disk;
block_device结构
block_device结构代表了内核中的一个块设备。它可以表示整个磁盘或一个特定的分区。当这个结构代表一个分区时,它的bd_contains成员指向包含这个分区的设备,bd_part成员指向设备的分区结构。当这个结构代表一个块设备时,bd_disk成员指向设备的gendisk结构。这个结构体在ldd3中没有提到;
// 设备(分区)结构体 struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ int bd_openers;//记录有多少进程打开该设备 struct inode * bd_inode; /* will die */ struct super_block * bd_super; struct mutex bd_mutex; /* open/close mutex */ struct list_head bd_inodes; void * bd_claiming; void * bd_holder; int bd_holders; bool bd_write_holder; #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif struct block_device * bd_contains;// 如果是分区,则指向主设备块;否则指向自己 unsigned bd_block_size; struct hd_struct * bd_part;// 指向对应的分区信息 /* number of times partitions within this device have been opened. */ unsigned bd_part_count;// 分区打开的次数,重新扫描则为0 int bd_invalidated; struct gendisk * bd_disk;// 指向gendisk struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device * to use this. NOTE: bd_claim allows an owner to claim * the same device multiple times, the owner must take special * care to not mess up bd_private for that case. */ unsigned long bd_private; /* The counter of freeze processes */ int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; };
request_queue结构体
gendisk结构体中有个成员queue,它是一个指向request_queue结构体的指针。每一个块设备都会有一个请求队列,当需要对设备操作时,把请求放在队列中。因为对块设备的I/O操作比较慢,不能立即完成,所以把所有的请求放在队列中,等到合适的时候再处理这些请求;
struct request_queue { /* * Together with queue_head for cacheline sharing */ struct list_head queue_head;//待处理请求的链表 struct request *last_merge;//指向队列中首先可能合并的描述符 struct elevator_queue *elevator;////指向elevator对象的指针(电梯算法) /* * the queue request freelist, one for reads and one for writes */ struct request_list rq;//为分配请求描述符所使用的数据结构 request_fn_proc *request_fn;//实现驱动程序的策略例程入口点的方法,策略例程方法来处理请求队列中的下一个请求 make_request_fn *make_request_fn;//将一个新请求插入请求队列时调用的方法 prep_rq_fn *prep_rq_fn; //该方法把这个处理请求的命令发送给硬件设备 unprep_rq_fn *unprep_rq_fn;//去掉块设备的方法 merge_bvec_fn *merge_bvec_fn; //当增加一个新段时,该方法返回可插人到某个已存在的bio结构中的字节数(通常未定义) softirq_done_fn *softirq_done_fn; rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; /* * Dispatch queue sorting */ sector_t end_sector; struct request *boundary_rq; /* * Delayed queue handling */ struct delayed_work delay_work; struct backing_dev_info backing_dev_info; /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. */ void *queuedata;//指向块设备驱动程序的私有数据的指针 /* * various queue flags, see QUEUE_* below */ unsigned long queue_flags;//描述请求队列状态的标志 /* * queue needs bounce pages for pages above this limit */ gfp_t bounce_gfp;//回弹缓冲区的内存分配标志 /* * protects queue structures from reentrancy. ->__queue_lock should * _never_ be used directly, it is queue private. always use * ->queue_lock. 
*/ spinlock_t __queue_lock; spinlock_t *queue_lock; /* * queue kobject */ struct kobject kobj; /* * queue settings */ unsigned long nr_requests; /* Max # of requests */ unsigned int nr_congestion_on;//如果待处理请求数超出了该闭值,则认为该队列是拥挤的 unsigned int nr_congestion_off;//如果待处理请求数在这个闭值的范围内,则认为该队列是不拥挤的 unsigned int nr_batching;//即使队列已满,仍可以由特殊进程“batcher”提交的待处理请求的最大值(通常为32) unsigned int dma_drain_size; void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; struct blk_queue_tag *queue_tags; struct list_head tag_busy_list; unsigned int nr_sorted; unsigned int in_flight[2]; unsigned int rq_timeout; struct timer_list timeout; struct list_head timeout_list; struct queue_limits limits; /* * sg stuff */ unsigned int sg_timeout; unsigned int sg_reserved_size; int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; #endif /* * for flush operations */ unsigned int flush_flags; unsigned int flush_not_queueable:1; unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; struct request flush_rq; struct mutex sysfs_lock; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; int bsg_job_size; struct bsg_class_device bsg_dev; #endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif };
操作函数
分配队列函数:
dev->queue = blk_init_queue(request_func, &dev->lock);
request_func函数指针是请求函数,负责执行块设备的读、写请求。
内核认为每个磁盘都是由512字节大小的扇区所组成的线性数组;
所有操作的第一步就是通知内核设备所支持的扇区大小,硬件扇区大小作为一个参数放在请求队列中,而不是放在gendisk结构中;
blk_queue_hardsect_size(dev->queue, hardsect_size);// 调用了该函数后,内核对设备使用设定的硬件扇区大小。
request结构体
request结构体就是请求操作块设备的请求结构体,该结构体被放到request_queue队列中,等到合适的时候再处理。
struct request { struct list_head queuelist; //请求结构体队列链表 struct call_single_data csd; struct request_queue *q;//所在的队列 unsigned int cmd_flags; enum rq_cmd_type_bits cmd_type;// 命令类型 unsigned long atomic_flags; int cpu; /* the following two fields are internal, NEVER access directly */ // 下面两个字段从不直接访问 unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio;//请求的bio结构链表,不能直接访问,要使用 rq_for_each_bio来遍历 struct bio *biotail;//应该是链表的尾部 struct hlist_node hash; /* merge hash */ /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the * completion_data share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ void *completion_data; }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. Flush requests are * never put on the IO scheduler. So let the flush fields share * space with the three elevator_private pointers. */ union { void *elevator_private[3]; struct { unsigned int seq; struct list_head list; rq_end_io_fn *saved_end_io; } flush; }; struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; unsigned long long io_start_time_ns; /* when passed to hardware */ #endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. 
*/ unsigned short nr_phys_segments; #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif unsigned short ioprio; int ref_count; void *special; /* opaque pointer available for LLD use */ char *buffer; /* kaddr of the current segment if available */ int tag; int errors; /* * when request is used as a packet command carrier */ unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; unsigned short cmd_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; unsigned int resid_len; /* residual count */ void *sense; unsigned long deadline; struct list_head timeout_list; unsigned int timeout; int retries; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; /* for bidi */ struct request *next_rq; };
操作函数
每个块设备驱动程序的核心都是它的请求函数。
块设备驱动程序的request函数:
void request(request_queue_t *queue);
当内核需要驱动程序处理读取、写入以及其他设备的操作时,就会调用该函数;在其返回前,request不需要完成队列中所有的请求,事实上,对大多数设备来说,它可能没有完成任何的请求。但是它必须启动对请求的响应,并且保证所有的请求最终都会被驱动程序所处理。也就是说,请求是由内核放到队列中的,内核随后调用该函数通知驱动程序队列中有请求需要处理;该函数只需启动对这些请求的处理就可以返回,真正的数据传输可以在之后才完成。
该函数和设备的队列绑定的,在设备队列生成的那一刻就绑定了该请求函数: dev->queue = blk_init_queue(request, &dev->lock);
BIO结构体
bio结构体其实是request结构体的实际数据,一个request结构体中包含一个或者多个bio结构体,在底层实际是按bio来对设备进行操作的。该结构被传递给I/O代码,代码会把它合并到一个已经存在的request结构体中,或者需要的话会再创建一个新的request结构体;bio结构体包含了驱动程序执行请求的全部信息。
struct bio { //该bio结构所要传输的第一个(512字节)扇区:磁盘的位置 sector_t bi_sector; /* device address in 512 byte sectors */ struct bio *bi_next; /* request queue link *///请求queue链表==指向下一个bio结构体 struct block_device *bi_bdev; // 相关设备 unsigned long bi_flags; /* status, command, etc */ unsigned long bi_rw; /* bottom bits READ/WRITE, * top bits priority */// 低位为读/写,高位为优先级 unsigned short bi_vcnt; /* how many bio_vec's *///多少个bio_vec数组 unsigned short bi_idx; /* current index into bvl_vec *///现在指向哪个数组元素,相当于偏移量 /* Number of segments in this BIO after * physical address coalescing is performed. */ unsigned int bi_phys_segments;// 合并后的segments数,bio中包含的物理段数目 //剩余的I/O计数,也就是以字节为单位需要传送的数据大小 unsigned int bi_size; /* residual I/O count */ /* * To keep track of the max segment size, we account for the * sizes of the first and last mergeable segments in this bio. */ unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ unsigned int bi_comp_cpu; /* completion CPU */ atomic_t bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */// 实际的数组链表 bio_end_io_t *bi_end_io; void *bi_private; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif bio_destructor_t *bi_destructor; /* destructor */ /* * We can inline a number of vecs at the end of the bio, to avoid * double allocations for a small number of bio_vecs. This member * MUST obviously be kept at the very end of the bio. */ struct bio_vec bi_inline_vecs[0]; };
sector_t bi_sector 该bio结构所要传输的第一个扇区;
unsigned int bi_size 以字节为单位所需要传输的数据大小,通常使用bio_sectors(bio)获得以扇区为单位的大小;
unsigned long bi_flags bio中一系列的状态标志位;读/写方向保存在bi_rw的最低有效位中(写请求时该位被设置),应通过 bio_data_dir(bio)查看,不要直接访问该位;
unsigned short bi_phys_segments 和 unsigned short bi_hw_segments:当DMA映射完成时,它们分别表示BIO中包含的物理段数目和硬件所能操作的段数目(这是ldd3中的定义;在上面列出的结构体中只有bi_phys_segments,且类型为unsigned int);
bio结构体的核心是一个名为bi_io_vec的数组,数组元素的类型是struct bio_vec,其定义为:
struct bio_vec { struct page *bv_page;//指向整个缓冲区所驻留的物理页面 unsigned int bv_len;//这个缓冲区以字节为单位的大小 unsigned int bv_offset;//缓冲区所驻留的页中以字节为单位的偏移量 };
转载地址:http://blog.csdn.net/yuzhihui_no1/article/details/46763817