Open-Channel SSD

Xing Lin published on 2019-08-13

In the Linux pblk implementation, a line corresponds to a chunk, which is one erase block. A LUN is a PU. A group is a channel.
Eventually, pblk calls pblk_submit_read() and pblk_submit_write() for reads/writes to the device.
Read code path (from the media) in pblk

1
2
3
4
5
6
7
8
9


/* pblk-read.c */

pblk_submit_read()
  pblk_read_ppalist_rq();
  or pblk_read_rq(); 
    pblk_lookup_l2p_seq();           // get physical addresses
  pblk_submit_io(pblk, rqd);
    nvm_submit_io(dev, rqd);
      dev->ops->submit_io(dev, rqd);

Write code path in pblk

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


/*
 * Abridged excerpt: for each of the nr_entries entries, write the bio's
 * current data into the ring write buffer (pblk->rwb) under logical address
 * lba + i, then call pblk_write_should_kick().
 * The declarations of i, nr_entries, lba, bpos, pos, w_ctx, bio and pblk are
 * elided from this excerpt.
 */
pblk_write_to_cache() {

	for (i = 0; i < nr_entries; i++) {
		/* Data at the bio's current position. */
		void *data = bio_data(bio);

		w_ctx.lba = lba + i;

		/* Ring-buffer position for the i-th entry, counted from bpos. */
		pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
		pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);

		/* Step the bio forward by one exposed page for the next entry. */
		bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
	}

	/* Pass the pblk instance itself; a prototype here would not call anything. */
	pblk_write_should_kick(pblk);
}

/* pblk-rb.c */
/*
 * Abridged excerpt: write one user entry into the ring buffer, then update
 * the cache mapping for w_ctx.lba with the entry's cacheline.
 * NOTE(review): `entry` and `pblk` are not declared in this excerpt;
 * presumably the full source derives them from rb and ring_pos -- confirm.
 */
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
	      struct pblk_w_ctx w_ctx, unsigned int ring_pos)
{
	/* Hand the data and its write context to the entry. */
	__pblk_rb_write_entry(rb, data, w_ctx, entry);
	/* Record the entry's cacheline for this logical address. */
	pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
}

/* pblk-core.c */
/*
 * Kick the writer only when the count reported by pblk_rb_read_count() for
 * the ring write buffer has reached pblk->min_write_pgs_data.
 */
void pblk_write_should_kick(struct pblk *pblk)
{
	unsigned int buffered = pblk_rb_read_count(&pblk->rwb);

	/* Below the minimum write size: nothing to do yet. */
	if (buffered < pblk->min_write_pgs_data)
		return;

	pblk_write_kick(pblk);
}

pblk_write_kick()
 wake_up_process(pblk->writer_ts);
  pblk_write_ts() 	/* pblk-write.c */
   pblk_submit_write(pblk, &secs_left);

pblk_submit_write(struct pblk* pblk, int *secs_left)
 pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)
 pblk_submit_io_set(pblk, rqd);
  pblk_setup_w_rq(pblk, rqd, &erase_ppa);   // will call pblk_map_rq() to allocate new physical pages; will also call pblk_line_erase() to first erase if needed.
  pblk_submit_io(pblk, rqd);
    nvm_submit_io(dev, rqd);
      dev->ops->submit_io(dev, rqd);

Garbage Collection code path in pblk

Erase operations are done during writes (when flushing the write buffer to the media).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43


/* pblk-gc.c */
/*
 * Abridged excerpt of GC setup: creates three kernel threads (gc_ts,
 * gc_writer_ts, gc_reader_ts) and allocates two workqueues.
 * NOTE(review): the declaration of `gc`, all error handling and the return
 * statement are elided from this excerpt.
 */
int pblk_gc_init(struct pblk *pblk)
{
 /* One thread per GC stage; each one runs the function named in its call. */
 gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
 gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, 
							"pblk-gc-writer-ts");
 gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
							"pblk-gc-reader-ts");

/* Workqueue that reads valid sectors from a line and submits them to the
 * GC writer to be recycled. Its max_active is PBLK_GC_MAX_READERS.
 */
gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);

/* Workqueue that prepares lines for GC; max_active is 1. */
gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
				WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
}


pblk_gc_ts()
 pblk_gc_run()
  pblk_gc_get_victim_line()
  pblk_gc_reader_kick()

pblk_gc_reader_ts()
 pblk_gc_read()
  pblk_gc_line()
   pblk_gc_line_prepare_ws()
    pblk_gc_line_ws();
     pblk_submit_read_gc(pblk, gc_rq);
      read_ppalist_rq_gc();           // find all nonempty pages.
      pblk_submit_io_sync();
     list_add_tail(&gc_rq->list, &gc->w_list);	// add nonempty pages to the gc->w_list
     pblk_gc_writer_kick(&pblk->gc);		// kick writer to do writes
    kref_put(&line->ref, pblk_line_put);        // add this line back to free list, if there is no reference.
pblk_gc_writer_ts()
 pblk_gc_write(pblk)
  pblk_write_gc_to_cache(pblk, gc_rq);
   pblk_rb_write_entry_gc();
    __pblk_rb_write_entry();
    pblk_update_map_gc();

Scratch space

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132


/* pblk-init.c */

pblk_make_rq()
  if (bio_op(bio) == REQ_OP_DISCARD) 
	pblk_discard(pblk, bio);

  if (bio_data_dir(bio) == READ) {
	blk_queue_split(q, &bio);
	pblk_submit_read(pblk, bio);
  } else {
	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);

	pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
  }

/* get physical address from logical address */
ppa = pblk_trans_map_get(pblk, lba)

int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
			 sector_t blba, int nr_secs, bool *from_cache)
void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
			  u64 *lba_list, int nr_secs)

/* pblk-read.c */

pblk_submit_read()
  pblk_read_ppalist_rq();
  or pblk_read_rq(); 
    pblk_lookup_l2p_seq();           // get physical addresses
  pblk_submit_io(pblk, rqd);
    nvm_submit_io(dev, rqd);
      dev->ops->submit_io(dev, rqd);

/* pblk-write.c */
pblk_submit_write(struct pblk* pblk, int *secs_left)
 pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)
 pblk_submit_io_set(pblk, rqd);
  pblk_setup_w_rq(pblk, rqd, &erase_ppa);   // will call pblk_map_rq() to allocate physical pages
  pblk_submit_io(pblk, rqd);

pblk_setup_w_rq()
 pblk_map_rq()       // allocate physical pages for new writes.

/* Allocate new physical pages from an erase block */
u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)


/* lightnvm.c */
/*
 * Table of nvm_dev_ops callbacks implemented by the nvme_nvm_* functions.
 * NOTE(review): presumably this is where dev->ops->submit_io(dev, rqd) in the
 * pblk read/write paths ends up (nvme_nvm_submit_io) -- confirm.
 */
static struct nvm_dev_ops nvme_nvm_dev_ops = {
	.identity		= nvme_nvm_identity,

	/* Bad-block table accessors. */
	.get_bb_tbl		= nvme_nvm_get_bb_tbl,
	.set_bb_tbl		= nvme_nvm_set_bb_tbl,

	.get_chk_meta		= nvme_nvm_get_chk_meta,

	/* I/O submission, with a separate _sync variant. */
	.submit_io		= nvme_nvm_submit_io,
	.submit_io_sync		= nvme_nvm_submit_io_sync,

	/* DMA pool lifecycle and per-device DMA allocation. */
	.create_dma_pool	= nvme_nvm_create_dma_pool,
	.destroy_dma_pool	= nvme_nvm_destroy_dma_pool,
	.dev_dma_alloc		= nvme_nvm_dev_dma_alloc,
	.dev_dma_free		= nvme_nvm_dev_dma_free,
};

/* pblk-cache.c */
/*
 * Abridged excerpt: for each of the nr_entries entries, write the bio's
 * current data into the ring write buffer (pblk->rwb) under logical address
 * lba + i, then call pblk_write_should_kick().
 * The declarations of i, nr_entries, lba, bpos, pos, w_ctx, bio and pblk are
 * elided from this excerpt.
 */
pblk_write_to_cache() {

	for (i = 0; i < nr_entries; i++) {
		/* Data at the bio's current position. */
		void *data = bio_data(bio);

		w_ctx.lba = lba + i;

		/* Ring-buffer position for the i-th entry, counted from bpos. */
		pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
		pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);

		/* Step the bio forward by one exposed page for the next entry. */
		bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
	}

	/* Pass the pblk instance itself; a prototype here would not call anything. */
	pblk_write_should_kick(pblk);
}

/* pblk-rb.c */
/*
 * Abridged excerpt: write one user entry into the ring buffer, then update
 * the cache mapping for w_ctx.lba with the entry's cacheline.
 * NOTE(review): `entry` and `pblk` are not declared in this excerpt;
 * presumably the full source derives them from rb and ring_pos -- confirm.
 */
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
	      struct pblk_w_ctx w_ctx, unsigned int ring_pos)
{
	/* Hand the data and its write context to the entry. */
	__pblk_rb_write_entry(rb, data, w_ctx, entry);
	/* Record the entry's cacheline for this logical address. */
	pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
}

/* pblk-core.c */
/*
 * Kick the writer only when the count reported by pblk_rb_read_count() for
 * the ring write buffer has reached pblk->min_write_pgs_data.
 */
void pblk_write_should_kick(struct pblk *pblk)
{
	unsigned int buffered = pblk_rb_read_count(&pblk->rwb);

	/* Below the minimum write size: nothing to do yet. */
	if (buffered < pblk->min_write_pgs_data)
		return;

	pblk_write_kick(pblk);
}

pblk_write_kick()
 wake_up_process(pblk->writer_ts);
  pblk_write_ts() 	/* pblk-write.c */
   pblk_submit_write(pblk, &secs_left);



/* pblk-map.c */
pblk_map_rq()
 pblk_map_page_data()
  pblk_alloc_page()

/* pblk-core.c */
pblk_blk_erase_sync
pblk_blk_erase_async


pblk_setup_w_rq()
 pblk_map_rq();
 or pblk_map_erase_rq();
   pblk_map_page_data()
     if (pblk_line_is_full(line)) 
       line = pblk_line_replace_data(pblk);
         pblk_line_erase(pblk, new);

pblk_map_rq()
 pblk_map_page_data()

pblk_map_erase_rq()
 pblk_map_page_data()
  if (pblk_line_is_full(line)) 
    line = pblk_line_replace_data(pblk);

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20


/* pblk.h */

/* a line is an erase block. */

/*
 * pblk line constants. Line types take the values 0-2; line states take
 * the values 9-15, so the two groups do not overlap.
 */
enum {
	/* Line Types */
	PBLK_LINETYPE_FREE = 0,
	PBLK_LINETYPE_LOG = 1,
	PBLK_LINETYPE_DATA = 2,

	/* Line state */
	PBLK_LINESTATE_NEW = 9,
	PBLK_LINESTATE_FREE = 10,
	PBLK_LINESTATE_OPEN = 11,
	PBLK_LINESTATE_CLOSED = 12,
	PBLK_LINESTATE_GC = 13,
	PBLK_LINESTATE_BAD = 14,
	PBLK_LINESTATE_CORRUPT = 15,

};

Create The Author Version for Your Paper (ACM)

Xing Lin published on 2019-04-28

So you have had your paper accepted to a conference and completed your camera-ready version. Now is the time to create an author version of it so that you can post it on your own web site! The trick is actually quite simple if you are using the ACM LaTeX template.

In acmart.cls, change the execution option for authorversion from false to true.

-\ExecuteOptionsX{authorversion=false}
+\ExecuteOptionsX{authorversion=true}

You may also want to add page numbers into your paper.

Programming - Principles and Practice Using C++

Xing Lin published on 2019-04-25

This is an introductory book about programming, written by Bjarne Stroustrup, the C++ creator.

Quotes

Programming is (among other things) a practical skill that you need to practice to master.
Learning involves repetition.
You must run before you can walk! Babies really do run by themselves before they learn the finer skills of slow, controlled walking. Similarly, you will dash ahead, occasionally stumbling, to get a feel of programming before slowing down to gain the necessary finer control and understanding.

Coordinating Garbage Collection for Arrays of Solid-State Drives

Xing Lin published on 2019-03-08

This is the start of a series of blog posts I plan to write about existing work related to garbage collection for SSDs.

The main idea of this paper is to run garbage collection during idle times to minimize the impact on foreground workloads. Garbage collection is scheduled to run simultaneously on all SSDs, which maximizes the time window during which no garbage collection is running and thus yields higher application performance. Figure 5 shows their approach and Figure 10 shows the effects.

DAX in ext2 filesystem

Xing Lin published on 2019-01-24

msync syscall

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


/mm/msync.c

/* MS_SYNC syncs the entire file - including mappings. */
SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
    vfs_fsync_range(file, fstart, fend, 1);
        file->f_op->fsync(file, start, end, datasync);
        == ext2_fsync();
            generic_file_fsync(file, start, end, datasync);
                __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                    file_write_and_wait_range(file, start, end);
                        __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL);
                            do_writepages(mapping, &wbc);
                                mapping->a_ops->writepages(mapping, wbc);

ext2_dax_writepages()
    dax_writeback_mapping_range(mapping, mapping->host->i_sb->s_bdev, wbc);
        dax_writeback_one(&xas, dax_dev, mapping, entry);
            dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
                arch_wb_cache_pmem(addr, size);
                    clean_cache_range(addr, size);
                        for (p = (void *)((unsigned long)addr & ~clflush_mask); p < vend; p += x86_clflush_size)
                            clwb(p);

/*
 * Address-space operations that ext2_set_file_ops() installs when
 * IS_DAX(inode) is true. Only .writepages has an ext2-specific handler;
 * the other three entries point at noop_* helpers.
 */
static const struct address_space_operations ext2_dax_aops = {
	.writepages		= ext2_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.set_page_dirty		= noop_set_page_dirty,
	.invalidatepage		= noop_invalidatepage,
};

/*
 * Install the inode and file operation tables on an ext2 file inode, then
 * pick its address-space operations: ext2_dax_aops for DAX inodes,
 * ext2_nobh_aops when the NOBH option is set, and ext2_aops otherwise.
 */
void ext2_set_file_ops(struct inode *inode)
{
	inode->i_op = &ext2_file_inode_operations;
	inode->i_fop = &ext2_file_operations;

	/* DAX takes precedence over the NOBH option. */
	if (IS_DAX(inode)) {
		inode->i_mapping->a_ops = &ext2_dax_aops;
		return;
	}

	if (test_opt(inode->i_sb, NOBH)) {
		inode->i_mapping->a_ops = &ext2_nobh_aops;
		return;
	}

	inode->i_mapping->a_ops = &ext2_aops;
}

nvdimm driver

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101


/drivers/nvdimm/pmem.h

/drivers/nvdimm/pmem.c

pmem_dax_direct_access()
    __pmem_direct_access()

pmem_copy_from_iter()
    copy_from_iter_flushcache(addr, bytes, i);
        _copy_from_iter_flushcache(addr, bytes, i);
            iterate_and_advance(i, bytes, v,
                __copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
                    v.iov_base, v.iov_len),
                memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
                    v.bv_offset, v.bv_len),
                memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
                    v.iov_len)
            )


pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, unsigned int op);
    pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, 0, op, sector);
        if (!op_is_write(op)) {
            read_pmem(page, off, pmem_addr, len);
            flush_dcache_page(page);
        } else {
            flush_dcache_page(page);
            write_pmem(pmem_addr, page, off, len);
        }  

write_pmem()
    memcpy_flushcache()
        __memcpy_flushcache(dst, src, cnt);
            /* assembly code ... */
            while (size >= 32) {
                asm("movq    (%0), %%r8\n"
                "movq   8(%0), %%r9\n"
                "movq  16(%0), %%r10\n"
                "movq  24(%0), %%r11\n"
                "movnti  %%r8,   (%1)\n"
                "movnti  %%r9,  8(%1)\n"
				...
                :: "r" (source), "r" (dest)
                : "memory", "r8", "r9", "r10", "r11");
                dest += 32;
                source += 32;
                size -= 32;
            }


/*
 * Copy len bytes into pmem_addr from the run of pages starting at `page`,
 * beginning off bytes into the first page, using memcpy_flushcache() for
 * each chunk.
 *
 * Each pass maps one page and copies no more than the bytes left in that
 * page after `off`, so a non-zero offset cannot push the copy past the end
 * of the mapping. The destination then advances by exactly the bytes
 * written. Assumes off < PAGE_SIZE.
 */
static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	unsigned int chunk;
	void *mem;

	while (len) {
		mem = kmap_atomic(page);
		/* Only PAGE_SIZE - off bytes remain in the mapped page. */
		chunk = min_t(unsigned int, len, PAGE_SIZE - off);
		memcpy_flushcache(pmem_addr, mem + off, chunk);
		kunmap_atomic(mem);
		len -= chunk;
		/* Only the first page starts at an offset. */
		off = 0;
		page++;
		/* Advance by what was copied so the destination stays contiguous. */
		pmem_addr += chunk;
	}
}

/*
 * Copy len bytes from pmem_addr into the run of pages starting at `page`,
 * beginning off bytes into the first page, using memcpy_mcsafe() for each
 * chunk.
 *
 * Each pass maps one page and copies no more than the bytes left in that
 * page after `off`, so a non-zero offset cannot push the copy past the end
 * of the mapping. The source then advances by exactly the bytes read.
 * Assumes off < PAGE_SIZE.
 *
 * Returns BLK_STS_IOERR as soon as memcpy_mcsafe() returns a non-zero
 * value, and BLK_STS_OK once every byte has been copied.
 */
static blk_status_t read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	unsigned int chunk;
	unsigned long rem;
	void *mem;

	while (len) {
		mem = kmap_atomic(page);
		/* Only PAGE_SIZE - off bytes remain in the mapped page. */
		chunk = min_t(unsigned int, len, PAGE_SIZE - off);
		rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
		/* Unmap before the early return so the mapping is never leaked. */
		kunmap_atomic(mem);
		if (rem)
			return BLK_STS_IOERR;
		len -= chunk;
		/* Only the first page starts at an offset. */
		off = 0;
		page++;
		/* Advance by what was copied so the source stays contiguous. */
		pmem_addr += chunk;
	}
	return BLK_STS_OK;
}

/*
 * Block-device operations for pmem: page-sized reads and writes are served
 * by pmem_rw_page().
 */
static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.revalidate_disk =	nvdimm_revalidate_disk,
};

/*
 * DAX operations for pmem: direct access plus the iterator copy routines
 * for each direction.
 */
static const struct dax_operations pmem_dax_ops = {
	.direct_access = pmem_dax_direct_access,
	.copy_from_iter = pmem_copy_from_iter,
	.copy_to_iter = pmem_copy_to_iter,
};

ext2

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


# /fs/ext2/file.c

ext2_file_read_iter()
    ext2_dax_read_iter()
        dax_iomap_rw();
            iomap_apply();
                dax_iomap_actor();
                    dax_direct_access();
                    dax_copy_from_iter();
                    dax_copy_to_iter();
                        dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);


/*
 * VM operations that ext2_file_mmap() installs on mappings of DAX inodes.
 * One handler, ext2_dax_fault, serves .fault, .page_mkwrite and
 * .pfn_mkwrite alike.
 */
static const struct vm_operations_struct ext2_dax_vm_ops = {
	.fault		= ext2_dax_fault,
	/*
	 * .huge_fault is not supported for DAX because allocation in ext2
	 * cannot be reliably aligned to huge page sizes and so pmd faults
	 * will always fail and fall back to regular faults.
	 */
	.page_mkwrite	= ext2_dax_fault,
	.pfn_mkwrite	= ext2_dax_fault,
};

/*
 * mmap handler for ext2 files. For a DAX inode it calls file_accessed(),
 * installs ext2_dax_vm_ops on the vma and returns 0; any other inode is
 * passed straight to generic_file_mmap().
 */
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (IS_DAX(file_inode(file))) {
		file_accessed(file);
		vma->vm_ops = &ext2_dax_vm_ops;
		return 0;
	}

	return generic_file_mmap(file, vma);
}

Wisdom from Michael Stonebraker

Xing Lin published on 2019-01-17

Where do ideas come from?

Ideas come from two sources: talking to real users with real problems and then trying to solve them. This ensures somebody cares about the ideas and the rubber meets the road and not the sky. The second source is to bounce possibly good (or bad) ideas off colleagues who will challenge them. In summary, the best chance for generating a good idea is to spend time in the real world and find an environment where you will be intellectually challenged.