improve virtio_disk comments; bring it closer to wording in the spec

This commit is contained in:
Robert Morris 2020-10-05 06:26:58 -04:00 committed by Frans Kaashoek
parent 548ffc97e1
commit 3092fe2c9e
2 changed files with 85 additions and 37 deletions

View file

@ -47,7 +47,8 @@
// must be a power of two. // must be a power of two.
#define NUM 8 #define NUM 8
struct VRingDesc { // a single descriptor, from the spec.
struct virtq_desc {
uint64 addr; uint64 addr;
uint32 len; uint32 len;
uint16 flags; uint16 flags;
@ -56,17 +57,38 @@ struct VRingDesc {
#define VRING_DESC_F_NEXT 1 // chained with another descriptor #define VRING_DESC_F_NEXT 1 // chained with another descriptor
#define VRING_DESC_F_WRITE 2 // device writes (vs read) #define VRING_DESC_F_WRITE 2 // device writes (vs read)
struct VRingUsedElem { // the (entire) avail ring, from the spec.
struct virtq_avail {
uint16 flags; // always zero
uint16 idx; // driver will write ring[idx] next
uint16 ring[NUM]; // descriptor numbers of chain heads
uint16 unused;
};
// one entry in the "used" ring, with which the
// device tells the driver about completed requests.
struct virtq_used_elem {
uint32 id; // index of start of completed descriptor chain uint32 id; // index of start of completed descriptor chain
uint32 len; uint32 len;
}; };
// for disk ops struct virtq_used {
uint16 flags; // always zero
uint16 idx; // device increments when it adds a ring[] entry
struct virtq_used_elem ring[NUM];
};
// these are specific to virtio block devices, e.g. disks,
// described in Section 5.2 of the spec.
#define VIRTIO_BLK_T_IN 0 // read the disk #define VIRTIO_BLK_T_IN 0 // read the disk
#define VIRTIO_BLK_T_OUT 1 // write the disk #define VIRTIO_BLK_T_OUT 1 // write the disk
struct UsedArea { // the format of the first descriptor in a disk request.
uint16 flags; // to be followed by two more descriptors containing
uint16 id; // the block, and a one-byte status.
struct VRingUsedElem elems[NUM]; struct virtio_blk_req {
uint32 type; // VIRTIO_BLK_T_IN or ..._OUT
uint32 reserved;
uint64 sector;
}; };

View file

@ -21,14 +21,37 @@
#define R(r) ((volatile uint32 *)(VIRTIO0 + (r))) #define R(r) ((volatile uint32 *)(VIRTIO0 + (r)))
static struct disk { static struct disk {
// memory for virtio descriptors &c for queue 0. // the virtio driver and device mostly communicate through a set of
// this is a global instead of allocated because it must // structures in RAM. pages[] allocates that memory. pages[] is a
// be multiple contiguous pages, which kalloc() // global (instead of calls to kalloc()) because it must consist of
// doesn't support, and page aligned. // two contiguous pages of page-aligned physical memory.
char pages[2*PGSIZE]; char pages[2*PGSIZE];
struct VRingDesc *desc;
uint16 *avail; // pages[] is divided into three regions (descriptors, avail, and
struct UsedArea *used; // used), as explained in Section 2.6 of the virtio specification
// for the legacy interface.
// https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf
// the first region of pages[] is a set (not a ring) of DMA
// descriptors, with which the driver tells the device where to read
// and write individual disk operations. there are NUM descriptors.
// most commands consist of a "chain" (a linked list) of a couple of
// these descriptors.
// points into pages[].
struct virtq_desc *desc;
// next is a ring in which the driver writes descriptor numbers
// that the driver would like the device to process. it only
// includes the head descriptor of each chain. the ring has
// NUM elements.
// points into pages[].
struct virtq_avail *avail;
// finally a ring in which the device writes descriptor numbers that
// the device has finished processing (just the head of each chain).
// there are NUM used ring entries.
// points into pages[].
struct virtq_used *used;
// our own book-keeping. // our own book-keeping.
char free[NUM]; // is a descriptor free? char free[NUM]; // is a descriptor free?
@ -98,14 +121,15 @@ virtio_disk_init(void)
memset(disk.pages, 0, sizeof(disk.pages)); memset(disk.pages, 0, sizeof(disk.pages));
*R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT; *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT;
// desc = pages -- num * VRingDesc // desc = pages -- num * virtq_desc
// avail = pages + 0x40 -- 2 * uint16, then num * uint16 // avail = pages + 0x40 -- 2 * uint16, then num * uint16
// used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem
disk.desc = (struct VRingDesc *) disk.pages; disk.desc = (struct virtq_desc *) disk.pages;
disk.avail = (uint16*)(((char*)disk.desc) + NUM*sizeof(struct VRingDesc)); disk.avail = (struct virtq_avail *)(disk.pages + NUM*sizeof(struct virtq_desc));
disk.used = (struct UsedArea *) (disk.pages + PGSIZE); disk.used = (struct virtq_used *) (disk.pages + PGSIZE);
// all NUM descriptors start out unused.
for(int i = 0; i < NUM; i++) for(int i = 0; i < NUM; i++)
disk.free[i] = 1; disk.free[i] = 1;
@ -156,6 +180,8 @@ free_chain(int i)
} }
} }
// allocate three descriptors (they need not be contiguous).
// disk transfers always use three descriptors.
static int static int
alloc3_desc(int *idx) alloc3_desc(int *idx)
{ {
@ -177,9 +203,9 @@ virtio_disk_rw(struct buf *b, int write)
acquire(&disk.vdisk_lock); acquire(&disk.vdisk_lock);
// the spec says that legacy block operations use three // the spec's Section 5.2 says that legacy block operations use
// descriptors: one for type/reserved/sector, one for // three descriptors: one for type/reserved/sector, one for the
// the data, one for a 1-byte status result. // data, one for a 1-byte status result.
// allocate the three descriptors. // allocate the three descriptors.
int idx[3]; int idx[3];
@ -193,11 +219,7 @@ virtio_disk_rw(struct buf *b, int write)
// format the three descriptors. // format the three descriptors.
// qemu's virtio-blk.c reads them. // qemu's virtio-blk.c reads them.
struct virtio_blk_outhdr { struct virtio_blk_req buf0;
uint32 type;
uint32 reserved;
uint64 sector;
} buf0;
if(write) if(write)
buf0.type = VIRTIO_BLK_T_OUT; // write the disk buf0.type = VIRTIO_BLK_T_OUT; // write the disk
@ -232,15 +254,13 @@ virtio_disk_rw(struct buf *b, int write)
b->disk = 1; b->disk = 1;
disk.info[idx[0]].b = b; disk.info[idx[0]].b = b;
// avail[0] is flags (always zero) // tell the device the first index in our chain of descriptors.
// avail[1] is an index into avail[2...] telling where we'll write next disk.avail->ring[disk.avail->idx % NUM] = idx[0];
// avail[2...] is a ring of NUM indices the device should process
// we only tell device the first index in our chain of descriptors.
disk.avail[2 + (disk.avail[1] % NUM)] = idx[0];
__sync_synchronize(); __sync_synchronize();
disk.avail[1] = disk.avail[1] + 1; // not % NUM ... // tell the device another avail ring entry is available.
disk.avail->idx += 1; // not % NUM ...
__sync_synchronize(); __sync_synchronize();
@ -262,16 +282,22 @@ virtio_disk_intr()
{ {
acquire(&disk.vdisk_lock); acquire(&disk.vdisk_lock);
// this ack may race with the device writing new notifications to // the device won't raise another interrupt until we tell it
// the "used" ring, in which case we may get an interrupt we don't // we've seen this interrupt, which the following line does.
// need, which is harmless. // this may race with the device writing new entries to
// the "used" ring, in which case we may process the new
// completion entries in this interrupt, and have nothing to do
// in the next interrupt, which is harmless.
*R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3; *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;
__sync_synchronize(); __sync_synchronize();
while(disk.used_idx != disk.used->id){ // the device increments disk.used->idx when it
// adds an entry to the used ring.
while(disk.used_idx != disk.used->idx){
__sync_synchronize(); __sync_synchronize();
int id = disk.used->elems[disk.used_idx % NUM].id; int id = disk.used->ring[disk.used_idx % NUM].id;
if(disk.info[id].status != 0) if(disk.info[id].status != 0)
panic("virtio_disk_intr status"); panic("virtio_disk_intr status");