linux块设备驱动之相关结构体

时间:2021-02-09 11:16:12

        我做的主要是ssd驱动,ssd驱动通过FTL转换成对一个磁盘的操作,也就相当于一个磁盘的块设备驱动。块设备驱动程序主要用于访问以固定大小的块为单位传输、并且数据可以随机访问的设备。


注册块设备

        和字符设备驱动一样,都必须先到内核注册下,才能操作设备;在头文件<linux/fs.h>中

        注册函数: int  register_blkdev(unsigned int major,  const char *name);

        参数:major 是主设备号,name是设备的名称,在/proc/devices中显示。

        如果major传递的是0,则由内核来分配一个主设备号给设备,并且返回给调用者。如果返回值为负数,则表示函数注册失败;

        注:其实如果major不为0的话就相当于字符设备中的静态设备号申请,如果为0,则相当于动态设备申请了;


        销毁函数:int unregister_blkdev(unsigned int major, const char *name);

        参数一定要和注册函数的参数匹配,否则出错;


gendisk结构体


结构体

        结构体如下:

/*
 * Describes one disk: its device numbers, name, partition table,
 * operations table and request queue.
 */
struct gendisk {
     /* major, first_minor and minors are input parameters only,
      * don't use directly.  Use disk_devt() and disk_max_parts().
      */
     int major;          /* major number of driver */
     int first_minor;
     int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
 
     char disk_name[DISK_NAME_LEN];  /* name of major driver */
     char *(*devnode)(struct gendisk *gd, mode_t *mode);
 
     unsigned int events;        /* supported events */
     unsigned int async_events;  /* async events, subset of all */
 
     /* Array of pointers to partitions indexed by partno.
      * Protected with matching bdev lock but stat and other
      * non-critical accesses use RCU.  Always access through
      * helpers.
      */
     // Partition information for the whole disk: a table whose core is an array of struct hd_struct pointers, one per partition
     struct disk_part_tbl __rcu *part_tbl;
     struct hd_struct part0;// embedded hd_struct (not a pointer); NOTE(review): presumably describes the whole disk as partition 0 - confirm
 
     const struct block_device_operations *fops;// table of block device operation callbacks
     struct request_queue *queue;// request queue used for this disk's I/O
     void *private_data;
 
     int flags;
     struct device *driverfs_dev;  // FIXME: remove
     struct kobject *slave_dir;
 
     struct timer_rand_state *random;
     atomic_t sync_io;       /* RAID */
     struct disk_events *ev;
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
     struct blk_integrity *integrity;
 #endif
     int node_id;
 };

        int  minors 该磁盘可使用的次设备号的数目,一个驱动器至少使用一个次设备号,如果驱动器是可分区的,则为每一个可能的分区分配一个次设备号;

        char  disk_name[32] 设置磁盘设备的名称,该名字在/proc/partitions和sysfs中显示;

        struct  request_queue  *queue 请求队列,为设备管理I/O请求;

        sector_t  capacity  以512字节为一个扇区,该驱动器可以包含的扇区数,一般通过set_capacity设置(注意:上面列出的这个版本的gendisk结构体中已经没有capacity成员,set_capacity实际上是把容量保存在part0.nr_sects中);  

        set_capacity(dev->gd,   nsectors*(hardsect_size/KERNEL_SECTOR_SIZE));KERNEL_SECTOR_SIZE是一个常量,使用该常量进行内核的512字节扇区到实际使用扇区大小的转换。


分区表

 // Partition table: a length plus a trailing array of pointers to hd_struct entries
 struct disk_part_tbl {
     struct rcu_head rcu_head;
     int len;
     struct hd_struct __rcu *last_lookup;
     struct hd_struct __rcu *part[];
 };


分区结构体

// Partition descriptor: where the partition starts, how large it is, and its bookkeeping
 struct hd_struct {
     sector_t start_sect;// first sector of this partition
     sector_t nr_sects;// size of the partition, in sectors
     sector_t alignment_offset;
     unsigned int discard_alignment;
     struct device __dev;// each partition embeds its own struct device
     struct kobject *holder_dir;
     int policy, partno;// partno is the partition number
     struct partition_meta_info *info;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
     int make_it_fail;
 #endif
     unsigned long stamp;
     atomic_t in_flight[2];
 #ifdef  CONFIG_SMP
     struct disk_stats __percpu *dkstats;
 #else
     struct disk_stats dkstats;
 #endif
     atomic_t ref;
     struct rcu_head rcu_head;
 };

操作函数指针

        在gendisk结构体中有个fops成员,这个成员是对块设备的操作函数指针的集合,如果熟悉字符设备的,应该不难理解;这个结构体是block_device_operations(相当于字符设备的file_operations结构体);

       

 /*
  * Table of callbacks a block driver supplies for a disk; referenced
  * through the fops member of struct gendisk.
  */
 struct block_device_operations {
     int (*open) (struct block_device *, fmode_t);
     int (*release) (struct gendisk *, fmode_t);
     int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
     int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
     int (*direct_access) (struct block_device *, sector_t,
                         void **, unsigned long *);
     unsigned int (*check_events) (struct gendisk *disk,
                       unsigned int clearing);
     /* ->media_changed() is DEPRECATED, use ->check_events() instead */
     int (*media_changed) (struct gendisk *);
     void (*unlock_native_capacity) (struct gendisk *);
     int (*revalidate_disk) (struct gendisk *);
     int (*getgeo)(struct block_device *, struct hd_geometry *);
     /* this callback is with swap_lock and sometimes page table lock held */
     void (*swap_slot_free_notify) (struct block_device *, unsigned long);
     struct module *owner;
 };
        这里有很多函数指针,但真正使用到的没有多少个。open/release 一般都会使用到(设备的打开、关闭);media_changed 用于检查驱动器内的介质是否被更换,如果更换了,内核会调用 revalidate_disk,让驱动程序针对新介质作出相应的处理(按上面结构体中的注释,media_changed 已不推荐使用,应改用 check_events);getgeo 函数用于获取设备的几何参数(填充 struct hd_geometry);


gendisk操作函数

        gendisk结构是一个动态分配的结构,它需要内核的一些特殊处理来进行初始化,驱动程序不能自己动态地分配该结构体,而必须调用:

        struct  gendisk  *alloc_disk(int  minors);

        参数:minors是该磁盘使用次设备号的数目,以后不能再修改的;


        卸载磁盘:

        void del_gendisk(struct  gendisk *gd);

        gendisk是一个引用计数结构,get_disk和put_disk函数负责处理引用计数,但驱动一般不需要直接使用这两个函数;

 

        分配了一个gendisk结构并不能使磁盘对系统可用,必须初始化结构体并调用add_disk:

        void add_disk(struct  gendisk *gd);

        一旦调用add_disk,磁盘设备将被“激活”,它提供的方法随时可能被调用。事实上,第一次调用这些方法时,add_disk函数可能还没有返回,因为内核会读取磁盘开头的几个块来查找分区表;因此必须在驱动完全初始化好、能够响应请求之后再调用add_disk;


block_device结构

        block_device结构代表了内核中的一个块设备。它可以表示整个磁盘或一个特定的分区。当这个结构代表一个分区时,它的bd_contains成员指向包含这个分区的设备,bd_part成员指向设备的分区结构。当这个结构代表一个块设备时,bd_disk成员指向设备的gendisk结构。这个结构体在ldd3中没有提到;

// Block device descriptor; may stand for a whole disk or for a single partition
 struct block_device {
     dev_t           bd_dev;  /* not a kdev_t - it's a search key */
     int         bd_openers;// open count (per the original annotation: how many processes have the device open)
     struct inode *      bd_inode;   /* will die */
     struct super_block *    bd_super;
     struct mutex        bd_mutex;   /* open/close mutex */
     struct list_head    bd_inodes;
     void *          bd_claiming;
     void *          bd_holder;
     int         bd_holders;
     bool            bd_write_holder;
 #ifdef CONFIG_SYSFS
     struct list_head    bd_holder_disks;
 #endif
     struct block_device *   bd_contains;// for a partition: the block_device of the containing disk; otherwise points to itself
     unsigned        bd_block_size;
     struct hd_struct *  bd_part;// the matching partition descriptor
     /* number of times partitions within this device have been opened. */
     unsigned        bd_part_count;// NOTE(review): original annotation says this is reset to 0 on rescan - confirm
     int         bd_invalidated;
     struct gendisk *    bd_disk;// the gendisk this device belongs to
     struct list_head    bd_list;
     /*   
      * Private data.  You must have bd_claim'ed the block_device
      * to use this.  NOTE:  bd_claim allows an owner to claim
      * the same device multiple times, the owner must take special
      * care to not mess up bd_private for that case.
      */
     unsigned long       bd_private;
 
     /* The counter of freeze processes */
     int         bd_fsfreeze_count;
     /* Mutex for freeze */
     struct mutex        bd_fsfreeze_mutex;
 };

request_queue结构体


        gendisk结构体中有个成员queue,它是一个指向request_queue结构体的指针。每一个块设备都会有一个请求队列,当需要对设备操作时,把请求放在队列中。因为块设备的I/O操作比较慢,不能在发起调用时立即完成,所以把所有的请求放在队列中,等到合适的时候再处理这些请求;

/*
 * Per-device request queue: holds the pending requests together with the
 * driver callbacks, locks, limits and settings used to process them.
 */
struct request_queue {
     /*
      * Together with queue_head for cacheline sharing
      */
     struct list_head    queue_head;// list of pending requests
     struct request      *last_merge;// NOTE(review): per the original annotation, the request to be tried first as a merge candidate - confirm
     struct elevator_queue   *elevator;// pointer to the elevator object (the "elevator" I/O scheduling algorithm)
 
     /*
      * the queue request freelist, one for reads and one for writes
      */
     struct request_list rq;// data structure used when allocating request descriptors
 
     request_fn_proc     *request_fn;// entry point of the driver's strategy routine, which handles the next request in the queue
     make_request_fn     *make_request_fn;// NOTE(review): per the original annotation, called when a new request is to be inserted into the queue - confirm
     prep_rq_fn      *prep_rq_fn; // NOTE(review): name suggests "prepare request"; the original annotation claimed it sends the command to the hardware - confirm
     unprep_rq_fn        *unprep_rq_fn;// NOTE(review): presumably undoes what prep_rq_fn did; the original annotation ("removes the block device") looks wrong - confirm
     merge_bvec_fn       *merge_bvec_fn; // when a new segment is added, returns how many bytes can be inserted into an existing bio (usually left undefined)
     softirq_done_fn     *softirq_done_fn;
     rq_timed_out_fn     *rq_timed_out_fn;
     dma_drain_needed_fn *dma_drain_needed;
     lld_busy_fn     *lld_busy_fn;
 
     /*
      * Dispatch queue sorting
      */
     sector_t        end_sector;
     struct request      *boundary_rq;
 /*
      * Delayed queue handling
      */
     struct delayed_work delay_work;
 
     struct backing_dev_info backing_dev_info;
 
     /*
      * The queue owner gets to use this for whatever they like.
      * ll_rw_blk doesn't touch it.
      */
     void            *queuedata;// pointer to the block driver's private data
 
     /*
      * various queue flags, see QUEUE_* below
      */
     unsigned long       queue_flags;// flags describing the state of the request queue
 
     /*
      * queue needs bounce pages for pages above this limit
      */
     gfp_t           bounce_gfp;// memory allocation flags for bounce buffers
 
     /*
      * protects queue structures from reentrancy. ->__queue_lock should
      * _never_ be used directly, it is queue private. always use
      * ->queue_lock.
      */
     spinlock_t      __queue_lock;
     spinlock_t      *queue_lock;
 /*
      * queue kobject
      */
     struct kobject kobj;
 
     /*
      * queue settings
      */
     unsigned long       nr_requests;    /* Max # of requests */
     unsigned int        nr_congestion_on;// the queue is considered congested once pending requests exceed this threshold
     unsigned int        nr_congestion_off;// the queue is considered uncongested once pending requests are within this threshold
     unsigned int        nr_batching;// max pending requests a special "batcher" process may still submit when the queue is full (usually 32, per the original annotation)
 
     unsigned int        dma_drain_size;
     void            *dma_drain_buffer;
     unsigned int        dma_pad_mask;
     unsigned int        dma_alignment;
 
     struct blk_queue_tag    *queue_tags;
     struct list_head    tag_busy_list;
 
     unsigned int        nr_sorted;
     unsigned int        in_flight[2];
 
     unsigned int        rq_timeout;
     struct timer_list   timeout;
     struct list_head    timeout_list;
 
     struct queue_limits limits;
 
     /*
      * sg stuff
      */
 unsigned int        sg_timeout;
     unsigned int        sg_reserved_size;
     int         node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
     struct blk_trace    *blk_trace;
 #endif
     /*
      * for flush operations
      */
     unsigned int        flush_flags;
     unsigned int        flush_not_queueable:1;
     unsigned int        flush_queue_delayed:1;
     unsigned int        flush_pending_idx:1;
     unsigned int        flush_running_idx:1;
     unsigned long       flush_pending_since;
     struct list_head    flush_queue[2];
     struct list_head    flush_data_in_flight;
     struct request      flush_rq;
 
     struct mutex        sysfs_lock;
 
 #if defined(CONFIG_BLK_DEV_BSG)
     bsg_job_fn      *bsg_job_fn;
     int         bsg_job_size;
     struct bsg_class_device bsg_dev;
 #endif
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
     /* Throttle data */
     struct throtl_data *td;
 #endif
 };


操作函数

        分配队列函数:

        dev->queue = blk_init_queue(request_func, &dev->lock);

        request_func函数指针是请求函数,负责执行块设备的读、写请求。


        内核认为每个磁盘都是由512字节大小的扇区所组成的线性数组;


        所有操作的第一步就是通知内核设备所支持的扇区大小,硬件扇区大小作为一个参数放在请求队列中,而不是放在gendisk结构中;

        blk_queue_hardsect_size(dev->queue,  hardsect_size);// 调用了该函数后,内核对设备使用设定的硬件扇区大小。(注:这是ldd3所用旧内核中的接口,在较新的内核中该函数已改名为blk_queue_logical_block_size。)


request结构体


        request结构体就是请求操作块设备的请求结构体,该结构体被放到request_queue队列中,等到合适的时候再处理。

 /*
  * One I/O request sitting on a request_queue: carries the chain of bios
  * to transfer plus command, timeout and completion bookkeeping.
  */
 struct request {
     struct list_head queuelist; // list linkage for the queue of requests
     struct call_single_data csd;
 
     struct request_queue *q;// the queue this request is on
 
     unsigned int cmd_flags;
     enum rq_cmd_type_bits cmd_type;// command type
     unsigned long atomic_flags;
 
     int cpu;
 
     /* the following two fields are internal, NEVER access directly */
     // (the two fields below must never be accessed directly)
     unsigned int __data_len;    /* total data len */
     sector_t __sector;      /* sector cursor */
 
     struct bio *bio;// chain of bios for this request; do not walk it directly, use an iterator helper (original annotation names rq_for_each_bio - NOTE(review): helper name may differ in this kernel version)
     struct bio *biotail;// presumably the tail of the bio chain - confirm
 
     struct hlist_node hash; /* merge hash */
     /*
      * The rb_node is only used inside the io scheduler, requests
      * are pruned when moved to the dispatch queue. So let the
      * completion_data share space with the rb_node.
      */
     union {
         struct rb_node rb_node; /* sort/lookup */
         void *completion_data;
     };
 /*
      * Three pointers are available for the IO schedulers, if they need
      * more they have to dynamically allocate it.  Flush requests are
      * never put on the IO scheduler. So let the flush fields share
      * space with the three elevator_private pointers.
      */
     union {
         void *elevator_private[3];
         struct {
             unsigned int        seq;
             struct list_head    list;
             rq_end_io_fn        *saved_end_io;
         } flush;
     };
 
     struct gendisk *rq_disk;
     struct hd_struct *part;
     unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
     unsigned long long start_time_ns;
     unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
     /* Number of scatter-gather DMA addr+len pairs after
      * physical address coalescing is performed.
      */
     unsigned short nr_phys_segments;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
     unsigned short nr_integrity_segments;
 #endif
 
     unsigned short ioprio;
 int ref_count;
 
     void *special;      /* opaque pointer available for LLD use */
     char *buffer;       /* kaddr of the current segment if available */
 
     int tag;
     int errors;
 
     /*
      * when request is used as a packet command carrier
      */
     unsigned char __cmd[BLK_MAX_CDB];
     unsigned char *cmd;
     unsigned short cmd_len;
 
     unsigned int extra_len; /* length of alignment and padding */
     unsigned int sense_len;
     unsigned int resid_len; /* residual count */
     void *sense;
 
     unsigned long deadline;
     struct list_head timeout_list;
     unsigned int timeout;
     int retries;
 
     /*
      * completion callback.
      */
     rq_end_io_fn *end_io;
     void *end_io_data;
 
     /* for bidi */
     struct request *next_rq;
 };


操作函数

        每个块设备驱动程序的核心都是它的请求函数。

        块设备驱动程序的request函数:

        void  request(struct request_queue  *queue);(ldd3中写作request_queue_t,较新的内核已去掉该typedef,应直接使用struct request_queue)

        当内核需要驱动程序处理读取、写入以及其他设备的操作时,就会调用该函数;在其返回前,request不需要完成队列中所有的请求,事实上,对大多数设备来说,它可能没有完成任何的请求。但是它必须启动对请求的响应,并且保证所有的请求最终都会被驱动程序所处理。也就是说,请求是由内核放入队列的;内核调用该函数只是通知驱动去处理队列中的请求,函数返回时这些请求可以尚未完成(例如硬件仍在异步传输),只要驱动保证它们最终都会被完成即可。

        该函数和设备的队列绑定的,在设备队列生成的那一刻就绑定了该请求函数:  dev->queue = blk_init_queue(request,  &dev->lock);


BIO结构体


        bio结构体其实是request结构体的实际数据,一个request结构体中包含一个或者多个bio结构体,在底层实际是按bio来对设备进行操作的。该结构被传递给I/O代码,代码会把它合并到一个已经存在的request结构体中,或者需要的话会再创建一个新的request结构体;bio结构体包含了驱动程序执行请求的全部信息。

/*
 * One block I/O unit: a starting sector on a block device plus an array
 * of bio_vec segments describing the memory to transfer.
 */
struct bio {
     // first (512-byte) sector this bio transfers, i.e. the position on disk
     sector_t        bi_sector;  /* device address in 512 byte
                            sectors */
     struct bio      *bi_next;   /* request queue link */// links to the next bio on the request's chain
     struct block_device *bi_bdev; // the device this bio targets
     unsigned long       bi_flags;   /* status, command, etc */
     unsigned long       bi_rw;      /* bottom bits READ/WRITE,
                          * top bits priority
                          */// i.e. read/write direction in the low bits, priority in the high bits
 
     unsigned short      bi_vcnt;    /* how many bio_vec's */// number of entries in the bio_vec array
     unsigned short      bi_idx;     /* current index into bvl_vec */// index of the array element currently being processed
 
     /* Number of segments in this BIO after
      * physical address coalescing is performed.
      */
     unsigned int        bi_phys_segments;// segment count after merging: number of physical segments in this bio
 
     // remaining I/O count, i.e. the amount of data still to transfer, in bytes (per the original annotation)
     unsigned int        bi_size;    /* residual I/O count */
 
     /*
      * To keep track of the max segment size, we account for the
      * sizes of the first and last mergeable segments in this bio.
      */
     unsigned int        bi_seg_front_size;
     unsigned int        bi_seg_back_size;
 
     unsigned int        bi_max_vecs;    /* max bvl_vecs we can hold */
 
     unsigned int        bi_comp_cpu;    /* completion CPU */
 atomic_t        bi_cnt;     /* pin count */
 
     struct bio_vec      *bi_io_vec; /* the actual vec list */// the bio_vec array itself
 
     bio_end_io_t        *bi_end_io;
 
     void            *bi_private;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
     struct bio_integrity_payload *bi_integrity;  /* data integrity */
 #endif
 
     bio_destructor_t    *bi_destructor; /* destructor */
 
     /*
      * We can inline a number of vecs at the end of the bio, to avoid
      * double allocations for a small number of bio_vecs. This member
      * MUST obviously be kept at the very end of the bio.
      */
     struct bio_vec      bi_inline_vecs[0];
 };

        sector_t  bi_sector 该bio结构所要传输的第一个扇区;

        unsigned  int bi_size 以字节为单位所需要传输的数据大小,通常更方便的做法是使用bio_sectors(bio)宏,它给出以扇区为单位的大小;

        unsigned  long bi_flags bio中一系列的状态标志位;注意读/写方向保存在bi_rw的最低位中(见上面结构体中bi_rw的注释),如果是写请求,该位将被设置,应通过 bio_data_dir(bio)查看,不要直接访问;

        unsigned  int  bi_phys_segments:表示物理地址合并之后BIO中包含的物理段数目;ldd3中还提到 bi_hw_segments(硬件所能操作的段数目),但在上面列出的这个版本的bio结构体中已经没有该成员;


        bio结构体的核心是一个名为bi_io_vec的数组,结构体名为:

 /* One segment of a bio: a page plus a length and offset within that page. */
 struct bio_vec {
     struct page *bv_page;// the physical page in which this buffer resides
     unsigned int    bv_len;// size of this buffer, in bytes
     unsigned int    bv_offset;// byte offset of the buffer within its page
 };

转载地址:http://blog.csdn.net/yuzhihui_no1/article/details/46763817