···407407 ret == -ENOIOCTLCMD;408408}409409410410-#ifdef CONFIG_FS_DAX411411-bool blkdev_dax_capable(struct block_device *bdev)412412-{413413- struct gendisk *disk = bdev->bd_disk;414414-415415- if (!disk->fops->direct_access)416416- return false;417417-418418- /*419419- * If the partition is not aligned on a page boundary, we can't420420- * do dax I/O to it.421421- */422422- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))423423- || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))424424- return false;425425-426426- /*427427- * If the device has known bad blocks, force all I/O through the428428- * driver / page cache.429429- *430430- * TODO: support finer grained dax error handling431431- */432432- if (disk->bb && disk->bb->count)433433- return false;434434-435435- return true;436436-}437437-#endif438438-439410static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,440411 unsigned cmd, unsigned long arg)441412{···569598 case BLKTRACESETUP:570599 case BLKTRACETEARDOWN:571600 return blk_trace_ioctl(bdev, cmd, argp);572572- case BLKDAXGET:573573- return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));574574- break;575601 case IOC_PR_REGISTER:576602 return blkdev_pr_register(bdev, argp);577603 case IOC_PR_RESERVE:
# Device-DAX: character-device access to differentiated memory ranges.
menuconfig DEV_DAX
	tristate "DAX: direct access to differentiated memory"
	depends on TRANSPARENT_HUGEPAGE
	default m if NVDIMM_DAX
	help
	  Support raw access to differentiated (persistence, bandwidth,
	  latency...) memory via an mmap(2) capable character
	  device.  Platform firmware or a device driver may identify a
	  platform memory resource that is differentiated from the
	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
	  restrictions that make the mapping behavior deterministic.

if DEV_DAX

# Backend that exposes libnvdimm-provided ranges as dax devices.
config DEV_DAX_PMEM
	tristate "PMEM DAX: direct access to persistent memory"
	depends on NVDIMM_DAX
	default DEV_DAX
	help
	  Support raw access to persistent memory.  Note that this
	  driver consumes memory ranges allocated and exported by the
	  libnvdimm sub-system.

	  Say Y if unsure

endif
···11+/*22+ * Copyright(c) 2016 Intel Corporation. All rights reserved.33+ *44+ * This program is free software; you can redistribute it and/or modify55+ * it under the terms of version 2 of the GNU General Public License as66+ * published by the Free Software Foundation.77+ *88+ * This program is distributed in the hope that it will be useful, but99+ * WITHOUT ANY WARRANTY; without even the implied warranty of1010+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU1111+ * General Public License for more details.1212+ */1313+#include <linux/pagemap.h>1414+#include <linux/module.h>1515+#include <linux/device.h>1616+#include <linux/pfn_t.h>1717+#include <linux/slab.h>1818+#include <linux/dax.h>1919+#include <linux/fs.h>2020+#include <linux/mm.h>2121+2222+static int dax_major;2323+static struct class *dax_class;2424+static DEFINE_IDA(dax_minor_ida);2525+2626+/**2727+ * struct dax_region - mapping infrastructure for dax devices2828+ * @id: kernel-wide unique region for a memory range2929+ * @base: linear address corresponding to @res3030+ * @kref: to pin while other agents have a need to do lookups3131+ * @dev: parent device backing this region3232+ * @align: allocation and mapping alignment for child dax devices3333+ * @res: physical address range of the region3434+ * @pfn_flags: identify whether the pfns are paged back or not3535+ */3636+struct dax_region {3737+ int id;3838+ struct ida ida;3939+ void *base;4040+ struct kref kref;4141+ struct device *dev;4242+ unsigned int align;4343+ struct resource res;4444+ unsigned long pfn_flags;4545+};4646+4747+/**4848+ * struct dax_dev - subdivision of a dax region4949+ * @region - parent region5050+ * @dev - device backing the character device5151+ * @kref - enable this data to be tracked in filp->private_data5252+ * @alive - !alive + rcu grace period == no new mappings can be established5353+ * @id - child id in the region5454+ * @num_resources - number of physical address extents in this device5555+ * @res - 
array of physical address ranges5656+ */5757+struct dax_dev {5858+ struct dax_region *region;5959+ struct device *dev;6060+ struct kref kref;6161+ bool alive;6262+ int id;6363+ int num_resources;6464+ struct resource res[0];6565+};6666+6767+static void dax_region_free(struct kref *kref)6868+{6969+ struct dax_region *dax_region;7070+7171+ dax_region = container_of(kref, struct dax_region, kref);7272+ kfree(dax_region);7373+}7474+7575+void dax_region_put(struct dax_region *dax_region)7676+{7777+ kref_put(&dax_region->kref, dax_region_free);7878+}7979+EXPORT_SYMBOL_GPL(dax_region_put);8080+8181+static void dax_dev_free(struct kref *kref)8282+{8383+ struct dax_dev *dax_dev;8484+8585+ dax_dev = container_of(kref, struct dax_dev, kref);8686+ dax_region_put(dax_dev->region);8787+ kfree(dax_dev);8888+}8989+9090+static void dax_dev_put(struct dax_dev *dax_dev)9191+{9292+ kref_put(&dax_dev->kref, dax_dev_free);9393+}9494+9595+struct dax_region *alloc_dax_region(struct device *parent, int region_id,9696+ struct resource *res, unsigned int align, void *addr,9797+ unsigned long pfn_flags)9898+{9999+ struct dax_region *dax_region;100100+101101+ dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);102102+103103+ if (!dax_region)104104+ return NULL;105105+106106+ memcpy(&dax_region->res, res, sizeof(*res));107107+ dax_region->pfn_flags = pfn_flags;108108+ kref_init(&dax_region->kref);109109+ dax_region->id = region_id;110110+ ida_init(&dax_region->ida);111111+ dax_region->align = align;112112+ dax_region->dev = parent;113113+ dax_region->base = addr;114114+115115+ return dax_region;116116+}117117+EXPORT_SYMBOL_GPL(alloc_dax_region);118118+119119+static ssize_t size_show(struct device *dev,120120+ struct device_attribute *attr, char *buf)121121+{122122+ struct dax_dev *dax_dev = dev_get_drvdata(dev);123123+ unsigned long long size = 0;124124+ int i;125125+126126+ for (i = 0; i < dax_dev->num_resources; i++)127127+ size += resource_size(&dax_dev->res[i]);128128+129129+ return 
sprintf(buf, "%llu\n", size);130130+}131131+static DEVICE_ATTR_RO(size);132132+133133+static struct attribute *dax_device_attributes[] = {134134+ &dev_attr_size.attr,135135+ NULL,136136+};137137+138138+static const struct attribute_group dax_device_attribute_group = {139139+ .attrs = dax_device_attributes,140140+};141141+142142+static const struct attribute_group *dax_attribute_groups[] = {143143+ &dax_device_attribute_group,144144+ NULL,145145+};146146+147147+static void unregister_dax_dev(void *_dev)148148+{149149+ struct device *dev = _dev;150150+ struct dax_dev *dax_dev = dev_get_drvdata(dev);151151+ struct dax_region *dax_region = dax_dev->region;152152+153153+ dev_dbg(dev, "%s\n", __func__);154154+155155+ /*156156+ * Note, rcu is not protecting the liveness of dax_dev, rcu is157157+ * ensuring that any fault handlers that might have seen158158+ * dax_dev->alive == true, have completed. Any fault handlers159159+ * that start after synchronize_rcu() has started will abort160160+ * upon seeing dax_dev->alive == false.161161+ */162162+ dax_dev->alive = false;163163+ synchronize_rcu();164164+165165+ get_device(dev);166166+ device_unregister(dev);167167+ ida_simple_remove(&dax_region->ida, dax_dev->id);168168+ ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));169169+ put_device(dev);170170+ dax_dev_put(dax_dev);171171+}172172+173173+int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,174174+ int count)175175+{176176+ struct device *parent = dax_region->dev;177177+ struct dax_dev *dax_dev;178178+ struct device *dev;179179+ int rc, minor;180180+ dev_t dev_t;181181+182182+ dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);183183+ if (!dax_dev)184184+ return -ENOMEM;185185+ memcpy(dax_dev->res, res, sizeof(*res) * count);186186+ dax_dev->num_resources = count;187187+ kref_init(&dax_dev->kref);188188+ dax_dev->alive = true;189189+ dax_dev->region = dax_region;190190+ kref_get(&dax_region->kref);191191+192192+ dax_dev->id = 
ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);193193+ if (dax_dev->id < 0) {194194+ rc = dax_dev->id;195195+ goto err_id;196196+ }197197+198198+ minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);199199+ if (minor < 0) {200200+ rc = minor;201201+ goto err_minor;202202+ }203203+204204+ dev_t = MKDEV(dax_major, minor);205205+ dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,206206+ dax_attribute_groups, "dax%d.%d", dax_region->id,207207+ dax_dev->id);208208+ if (IS_ERR(dev)) {209209+ rc = PTR_ERR(dev);210210+ goto err_create;211211+ }212212+ dax_dev->dev = dev;213213+214214+ rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev);215215+ if (rc) {216216+ unregister_dax_dev(dev);217217+ return rc;218218+ }219219+220220+ return 0;221221+222222+ err_create:223223+ ida_simple_remove(&dax_minor_ida, minor);224224+ err_minor:225225+ ida_simple_remove(&dax_region->ida, dax_dev->id);226226+ err_id:227227+ dax_dev_put(dax_dev);228228+229229+ return rc;230230+}231231+EXPORT_SYMBOL_GPL(devm_create_dax_dev);232232+233233+/* return an unmapped area aligned to the dax region specified alignment */234234+static unsigned long dax_dev_get_unmapped_area(struct file *filp,235235+ unsigned long addr, unsigned long len, unsigned long pgoff,236236+ unsigned long flags)237237+{238238+ unsigned long off, off_end, off_align, len_align, addr_align, align;239239+ struct dax_dev *dax_dev = filp ? 
filp->private_data : NULL;240240+ struct dax_region *dax_region;241241+242242+ if (!dax_dev || addr)243243+ goto out;244244+245245+ dax_region = dax_dev->region;246246+ align = dax_region->align;247247+ off = pgoff << PAGE_SHIFT;248248+ off_end = off + len;249249+ off_align = round_up(off, align);250250+251251+ if ((off_end <= off_align) || ((off_end - off_align) < align))252252+ goto out;253253+254254+ len_align = len + align;255255+ if ((off + len_align) < off)256256+ goto out;257257+258258+ addr_align = current->mm->get_unmapped_area(filp, addr, len_align,259259+ pgoff, flags);260260+ if (!IS_ERR_VALUE(addr_align)) {261261+ addr_align += (off - addr_align) & (align - 1);262262+ return addr_align;263263+ }264264+ out:265265+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);266266+}267267+268268+static int __match_devt(struct device *dev, const void *data)269269+{270270+ const dev_t *devt = data;271271+272272+ return dev->devt == *devt;273273+}274274+275275+static struct device *dax_dev_find(dev_t dev_t)276276+{277277+ return class_find_device(dax_class, NULL, &dev_t, __match_devt);278278+}279279+280280+static int dax_dev_open(struct inode *inode, struct file *filp)281281+{282282+ struct dax_dev *dax_dev = NULL;283283+ struct device *dev;284284+285285+ dev = dax_dev_find(inode->i_rdev);286286+ if (!dev)287287+ return -ENXIO;288288+289289+ device_lock(dev);290290+ dax_dev = dev_get_drvdata(dev);291291+ if (dax_dev) {292292+ dev_dbg(dev, "%s\n", __func__);293293+ filp->private_data = dax_dev;294294+ kref_get(&dax_dev->kref);295295+ inode->i_flags = S_DAX;296296+ }297297+ device_unlock(dev);298298+299299+ if (!dax_dev) {300300+ put_device(dev);301301+ return -ENXIO;302302+ }303303+ return 0;304304+}305305+306306+static int dax_dev_release(struct inode *inode, struct file *filp)307307+{308308+ struct dax_dev *dax_dev = filp->private_data;309309+ struct device *dev = dax_dev->dev;310310+311311+ dev_dbg(dax_dev->dev, "%s\n", __func__);312312+ 
dax_dev_put(dax_dev);313313+ put_device(dev);314314+315315+ return 0;316316+}317317+318318+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,319319+ const char *func)320320+{321321+ struct dax_region *dax_region = dax_dev->region;322322+ struct device *dev = dax_dev->dev;323323+ unsigned long mask;324324+325325+ if (!dax_dev->alive)326326+ return -ENXIO;327327+328328+ /* prevent private / writable mappings from being established */329329+ if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {330330+ dev_info(dev, "%s: %s: fail, attempted private mapping\n",331331+ current->comm, func);332332+ return -EINVAL;333333+ }334334+335335+ mask = dax_region->align - 1;336336+ if (vma->vm_start & mask || vma->vm_end & mask) {337337+ dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",338338+ current->comm, func, vma->vm_start, vma->vm_end,339339+ mask);340340+ return -EINVAL;341341+ }342342+343343+ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV344344+ && (vma->vm_flags & VM_DONTCOPY) == 0) {345345+ dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",346346+ current->comm, func);347347+ return -EINVAL;348348+ }349349+350350+ if (!vma_is_dax(vma)) {351351+ dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",352352+ current->comm, func);353353+ return -EINVAL;354354+ }355355+356356+ return 0;357357+}358358+359359+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,360360+ unsigned long size)361361+{362362+ struct resource *res;363363+ phys_addr_t phys;364364+ int i;365365+366366+ for (i = 0; i < dax_dev->num_resources; i++) {367367+ res = &dax_dev->res[i];368368+ phys = pgoff * PAGE_SIZE + res->start;369369+ if (phys >= res->start && phys <= res->end)370370+ break;371371+ pgoff -= PHYS_PFN(resource_size(res));372372+ }373373+374374+ if (i < dax_dev->num_resources) {375375+ res = &dax_dev->res[i];376376+ if (phys + size - 1 <= res->end)377377+ return phys;378378+ }379379+380380+ 
return -1;381381+}382382+383383+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,384384+ struct vm_fault *vmf)385385+{386386+ unsigned long vaddr = (unsigned long) vmf->virtual_address;387387+ struct device *dev = dax_dev->dev;388388+ struct dax_region *dax_region;389389+ int rc = VM_FAULT_SIGBUS;390390+ phys_addr_t phys;391391+ pfn_t pfn;392392+393393+ if (check_vma(dax_dev, vma, __func__))394394+ return VM_FAULT_SIGBUS;395395+396396+ dax_region = dax_dev->region;397397+ if (dax_region->align > PAGE_SIZE) {398398+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);399399+ return VM_FAULT_SIGBUS;400400+ }401401+402402+ phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);403403+ if (phys == -1) {404404+ dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,405405+ vmf->pgoff);406406+ return VM_FAULT_SIGBUS;407407+ }408408+409409+ pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);410410+411411+ rc = vm_insert_mixed(vma, vaddr, pfn);412412+413413+ if (rc == -ENOMEM)414414+ return VM_FAULT_OOM;415415+ if (rc < 0 && rc != -EBUSY)416416+ return VM_FAULT_SIGBUS;417417+418418+ return VM_FAULT_NOPAGE;419419+}420420+421421+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)422422+{423423+ int rc;424424+ struct file *filp = vma->vm_file;425425+ struct dax_dev *dax_dev = filp->private_data;426426+427427+ dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,428428+ current->comm, (vmf->flags & FAULT_FLAG_WRITE)429429+ ? 
"write" : "read", vma->vm_start, vma->vm_end);430430+ rcu_read_lock();431431+ rc = __dax_dev_fault(dax_dev, vma, vmf);432432+ rcu_read_unlock();433433+434434+ return rc;435435+}436436+437437+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,438438+ struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,439439+ unsigned int flags)440440+{441441+ unsigned long pmd_addr = addr & PMD_MASK;442442+ struct device *dev = dax_dev->dev;443443+ struct dax_region *dax_region;444444+ phys_addr_t phys;445445+ pgoff_t pgoff;446446+ pfn_t pfn;447447+448448+ if (check_vma(dax_dev, vma, __func__))449449+ return VM_FAULT_SIGBUS;450450+451451+ dax_region = dax_dev->region;452452+ if (dax_region->align > PMD_SIZE) {453453+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);454454+ return VM_FAULT_SIGBUS;455455+ }456456+457457+ /* dax pmd mappings require pfn_t_devmap() */458458+ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {459459+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);460460+ return VM_FAULT_SIGBUS;461461+ }462462+463463+ pgoff = linear_page_index(vma, pmd_addr);464464+ phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);465465+ if (phys == -1) {466466+ dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,467467+ pgoff);468468+ return VM_FAULT_SIGBUS;469469+ }470470+471471+ pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);472472+473473+ return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,474474+ flags & FAULT_FLAG_WRITE);475475+}476476+477477+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,478478+ pmd_t *pmd, unsigned int flags)479479+{480480+ int rc;481481+ struct file *filp = vma->vm_file;482482+ struct dax_dev *dax_dev = filp->private_data;483483+484484+ dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,485485+ current->comm, (flags & FAULT_FLAG_WRITE)486486+ ? 
"write" : "read", vma->vm_start, vma->vm_end);487487+488488+ rcu_read_lock();489489+ rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);490490+ rcu_read_unlock();491491+492492+ return rc;493493+}494494+495495+static void dax_dev_vm_open(struct vm_area_struct *vma)496496+{497497+ struct file *filp = vma->vm_file;498498+ struct dax_dev *dax_dev = filp->private_data;499499+500500+ dev_dbg(dax_dev->dev, "%s\n", __func__);501501+ kref_get(&dax_dev->kref);502502+}503503+504504+static void dax_dev_vm_close(struct vm_area_struct *vma)505505+{506506+ struct file *filp = vma->vm_file;507507+ struct dax_dev *dax_dev = filp->private_data;508508+509509+ dev_dbg(dax_dev->dev, "%s\n", __func__);510510+ dax_dev_put(dax_dev);511511+}512512+513513+static const struct vm_operations_struct dax_dev_vm_ops = {514514+ .fault = dax_dev_fault,515515+ .pmd_fault = dax_dev_pmd_fault,516516+ .open = dax_dev_vm_open,517517+ .close = dax_dev_vm_close,518518+};519519+520520+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)521521+{522522+ struct dax_dev *dax_dev = filp->private_data;523523+ int rc;524524+525525+ dev_dbg(dax_dev->dev, "%s\n", __func__);526526+527527+ rc = check_vma(dax_dev, vma, __func__);528528+ if (rc)529529+ return rc;530530+531531+ kref_get(&dax_dev->kref);532532+ vma->vm_ops = &dax_dev_vm_ops;533533+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;534534+ return 0;535535+536536+}537537+538538+static const struct file_operations dax_fops = {539539+ .llseek = noop_llseek,540540+ .owner = THIS_MODULE,541541+ .open = dax_dev_open,542542+ .release = dax_dev_release,543543+ .get_unmapped_area = dax_dev_get_unmapped_area,544544+ .mmap = dax_dev_mmap,545545+};546546+547547+static int __init dax_init(void)548548+{549549+ int rc;550550+551551+ rc = register_chrdev(0, "dax", &dax_fops);552552+ if (rc < 0)553553+ return rc;554554+ dax_major = rc;555555+556556+ dax_class = class_create(THIS_MODULE, "dax");557557+ if (IS_ERR(dax_class)) {558558+ 
unregister_chrdev(dax_major, "dax");559559+ return PTR_ERR(dax_class);560560+ }561561+562562+ return 0;563563+}564564+565565+static void __exit dax_exit(void)566566+{567567+ class_destroy(dax_class);568568+ unregister_chrdev(dax_major, "dax");569569+ ida_destroy(&dax_minor_ida);570570+}571571+572572+MODULE_AUTHOR("Intel Corporation");573573+MODULE_LICENSE("GPL v2");574574+subsys_initcall(dax_init);575575+module_exit(dax_exit);
+24
drivers/dax/dax.h
/*
 * Copyright(c) 2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#ifndef __DAX_H__
#define __DAX_H__
struct device;
struct resource;
struct dax_region;

/* Drop a reference on @dax_region. */
void dax_region_put(struct dax_region *dax_region);

/*
 * Allocate a dax region for the physical range @res.  @parent,
 * @region_id, @align, @addr and @pfn_flags describe the region to
 * create.
 */
struct dax_region *alloc_dax_region(struct device *parent,
		int region_id, struct resource *res, unsigned int align,
		void *addr, unsigned long pfn_flags);

/*
 * Create a dax device inside @dax_region covering the @count extents in
 * @res.  Returns an int status code.
 */
int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
		int count);
#endif /* __DAX_H__ */
+158
drivers/dax/pmem.c
···11+/*22+ * Copyright(c) 2016 Intel Corporation. All rights reserved.33+ *44+ * This program is free software; you can redistribute it and/or modify55+ * it under the terms of version 2 of the GNU General Public License as66+ * published by the Free Software Foundation.77+ *88+ * This program is distributed in the hope that it will be useful, but99+ * WITHOUT ANY WARRANTY; without even the implied warranty of1010+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU1111+ * General Public License for more details.1212+ */1313+#include <linux/percpu-refcount.h>1414+#include <linux/memremap.h>1515+#include <linux/module.h>1616+#include <linux/pfn_t.h>1717+#include "../nvdimm/pfn.h"1818+#include "../nvdimm/nd.h"1919+#include "dax.h"2020+2121+struct dax_pmem {2222+ struct device *dev;2323+ struct percpu_ref ref;2424+ struct completion cmp;2525+};2626+2727+struct dax_pmem *to_dax_pmem(struct percpu_ref *ref)2828+{2929+ return container_of(ref, struct dax_pmem, ref);3030+}3131+3232+static void dax_pmem_percpu_release(struct percpu_ref *ref)3333+{3434+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);3535+3636+ dev_dbg(dax_pmem->dev, "%s\n", __func__);3737+ complete(&dax_pmem->cmp);3838+}3939+4040+static void dax_pmem_percpu_exit(void *data)4141+{4242+ struct percpu_ref *ref = data;4343+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);4444+4545+ dev_dbg(dax_pmem->dev, "%s\n", __func__);4646+ percpu_ref_exit(ref);4747+ wait_for_completion(&dax_pmem->cmp);4848+}4949+5050+static void dax_pmem_percpu_kill(void *data)5151+{5252+ struct percpu_ref *ref = data;5353+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);5454+5555+ dev_dbg(dax_pmem->dev, "%s\n", __func__);5656+ percpu_ref_kill(ref);5757+}5858+5959+static int dax_pmem_probe(struct device *dev)6060+{6161+ int rc;6262+ void *addr;6363+ struct resource res;6464+ struct nd_pfn_sb *pfn_sb;6565+ struct dax_pmem *dax_pmem;6666+ struct nd_region *nd_region;6767+ struct nd_namespace_io *nsio;6868+ struct dax_region 
*dax_region;6969+ struct nd_namespace_common *ndns;7070+ struct nd_dax *nd_dax = to_nd_dax(dev);7171+ struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;7272+ struct vmem_altmap __altmap, *altmap = NULL;7373+7474+ ndns = nvdimm_namespace_common_probe(dev);7575+ if (IS_ERR(ndns))7676+ return PTR_ERR(ndns);7777+ nsio = to_nd_namespace_io(&ndns->dev);7878+7979+ /* parse the 'pfn' info block via ->rw_bytes */8080+ devm_nsio_enable(dev, nsio);8181+ altmap = nvdimm_setup_pfn(nd_pfn, &res, &__altmap);8282+ if (IS_ERR(altmap))8383+ return PTR_ERR(altmap);8484+ devm_nsio_disable(dev, nsio);8585+8686+ pfn_sb = nd_pfn->pfn_sb;8787+8888+ if (!devm_request_mem_region(dev, nsio->res.start,8989+ resource_size(&nsio->res), dev_name(dev))) {9090+ dev_warn(dev, "could not reserve region %pR\n", &nsio->res);9191+ return -EBUSY;9292+ }9393+9494+ dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL);9595+ if (!dax_pmem)9696+ return -ENOMEM;9797+9898+ dax_pmem->dev = dev;9999+ init_completion(&dax_pmem->cmp);100100+ rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0,101101+ GFP_KERNEL);102102+ if (rc)103103+ return rc;104104+105105+ rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);106106+ if (rc) {107107+ dax_pmem_percpu_exit(&dax_pmem->ref);108108+ return rc;109109+ }110110+111111+ addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap);112112+ if (IS_ERR(addr))113113+ return PTR_ERR(addr);114114+115115+ rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref);116116+ if (rc) {117117+ dax_pmem_percpu_kill(&dax_pmem->ref);118118+ return rc;119119+ }120120+121121+ nd_region = to_nd_region(dev->parent);122122+ dax_region = alloc_dax_region(dev, nd_region->id, &res,123123+ le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP);124124+ if (!dax_region)125125+ return -ENOMEM;126126+127127+ /* TODO: support for subdividing a dax region... 
*/128128+ rc = devm_create_dax_dev(dax_region, &res, 1);129129+130130+ /* child dax_dev instances now own the lifetime of the dax_region */131131+ dax_region_put(dax_region);132132+133133+ return rc;134134+}135135+136136+static struct nd_device_driver dax_pmem_driver = {137137+ .probe = dax_pmem_probe,138138+ .drv = {139139+ .name = "dax_pmem",140140+ },141141+ .type = ND_DRIVER_DAX_PMEM,142142+};143143+144144+static int __init dax_pmem_init(void)145145+{146146+ return nd_driver_register(&dax_pmem_driver);147147+}148148+module_init(dax_pmem_init);149149+150150+static void __exit dax_pmem_exit(void)151151+{152152+ driver_unregister(&dax_pmem_driver.drv);153153+}154154+module_exit(dax_pmem_exit);155155+156156+MODULE_LICENSE("GPL v2");157157+MODULE_AUTHOR("Intel Corporation");158158+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
+5-4
drivers/nvdimm/bus.c
···124124 struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);125125 struct module *provider = to_bus_provider(dev);126126 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);127127- int rc;127127+ int rc = 0;128128129129- rc = nd_drv->remove(dev);129129+ if (nd_drv->remove)130130+ rc = nd_drv->remove(dev);130131 nd_region_disable(nvdimm_bus, dev);131132132133 dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,···297296 return -EINVAL;298297 }299298300300- if (!nd_drv->probe || !nd_drv->remove) {301301- pr_debug("->probe() and ->remove() must be specified\n");299299+ if (!nd_drv->probe) {300300+ pr_debug("%s ->probe() must be specified\n", mod_name);302301 return -EINVAL;303302 }304303
+21-2
drivers/nvdimm/claim.c
···9393 return true;9494}95959696+struct nd_pfn *to_nd_pfn_safe(struct device *dev)9797+{9898+ /*9999+ * pfn device attributes are re-used by dax device instances, so we100100+ * need to be careful to correct device-to-nd_pfn conversion.101101+ */102102+ if (is_nd_pfn(dev))103103+ return to_nd_pfn(dev);104104+105105+ if (is_nd_dax(dev)) {106106+ struct nd_dax *nd_dax = to_nd_dax(dev);107107+108108+ return &nd_dax->nd_pfn;109109+ }110110+111111+ WARN_ON(1);112112+ return NULL;113113+}114114+96115static void nd_detach_and_reset(struct device *dev,97116 struct nd_namespace_common **_ndns)98117{···125106 nd_btt->lbasize = 0;126107 kfree(nd_btt->uuid);127108 nd_btt->uuid = NULL;128128- } else if (is_nd_pfn(dev)) {129129- struct nd_pfn *nd_pfn = to_nd_pfn(dev);109109+ } else if (is_nd_pfn(dev) || is_nd_dax(dev)) {110110+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);130111131112 kfree(nd_pfn->uuid);132113 nd_pfn->uuid = NULL;
···320320 return pmem_attach_disk(dev, ndns);321321322322 /* if we find a valid info-block we'll come back as that personality */323323- if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0)323323+ if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0324324+ || nd_dax_probe(dev, ndns) == 0)324325 return -ENXIO;325326326327 /* ...otherwise we're just a raw pmem device */
···2929#include <linux/log2.h>3030#include <linux/cleancache.h>3131#include <linux/dax.h>3232+#include <linux/badblocks.h>3233#include <asm/uaccess.h>3334#include "internal.h"3435···11601159}11611160EXPORT_SYMBOL(bd_set_size);1162116111621162+static bool blkdev_dax_capable(struct block_device *bdev)11631163+{11641164+ struct gendisk *disk = bdev->bd_disk;11651165+11661166+ if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))11671167+ return false;11681168+11691169+ /*11701170+ * If the partition is not aligned on a page boundary, we can't11711171+ * do dax I/O to it.11721172+ */11731173+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))11741174+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))11751175+ return false;11761176+11771177+ /*11781178+ * If the device has known bad blocks, force all I/O through the11791179+ * driver / page cache.11801180+ *11811181+ * TODO: support finer grained dax error handling11821182+ */11831183+ if (disk->bb && disk->bb->count)11841184+ return false;11851185+11861186+ return true;11871187+}11881188+11631189static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);1164119011651191/*···17521724 .is_dirty_writeback = buffer_check_dirty_writeback,17531725};1754172617551755-#ifdef CONFIG_FS_DAX17561756-/*17571757- * In the raw block case we do not need to contend with truncation nor17581758- * unwritten file extents. Without those concerns there is no need for17591759- * additional locking beyond the mmap_sem context that these routines17601760- * are already executing under.17611761- *17621762- * Note, there is no protection if the block device is dynamically17631763- * resized (partition grow/shrink) during a fault. 
A stable block device17641764- * size is already not enforced in the blkdev_direct_IO path.17651765- *17661766- * For DAX, it is the responsibility of the block device driver to17671767- * ensure the whole-disk device size is stable while requests are in17681768- * flight.17691769- *17701770- * Finally, unlike the filemap_page_mkwrite() case there is no17711771- * filesystem superblock to sync against freezing. We still include a17721772- * pfn_mkwrite callback for dax drivers to receive write fault17731773- * notifications.17741774- */17751775-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)17761776-{17771777- return __dax_fault(vma, vmf, blkdev_get_block, NULL);17781778-}17791779-17801780-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,17811781- struct vm_fault *vmf)17821782-{17831783- return dax_pfn_mkwrite(vma, vmf);17841784-}17851785-17861786-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,17871787- pmd_t *pmd, unsigned int flags)17881788-{17891789- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);17901790-}17911791-17921792-static const struct vm_operations_struct blkdev_dax_vm_ops = {17931793- .fault = blkdev_dax_fault,17941794- .pmd_fault = blkdev_dax_pmd_fault,17951795- .pfn_mkwrite = blkdev_dax_pfn_mkwrite,17961796-};17971797-17981798-static const struct vm_operations_struct blkdev_default_vm_ops = {17991799- .fault = filemap_fault,18001800- .map_pages = filemap_map_pages,18011801-};18021802-18031803-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)18041804-{18051805- struct inode *bd_inode = bdev_file_inode(file);18061806-18071807- file_accessed(file);18081808- if (IS_DAX(bd_inode)) {18091809- vma->vm_ops = &blkdev_dax_vm_ops;18101810- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;18111811- } else {18121812- vma->vm_ops = &blkdev_default_vm_ops;18131813- }18141814-18151815- return 0;18161816-}18171817-#else18181818-#define blkdev_mmap 
generic_file_mmap18191819-#endif18201820-18211727const struct file_operations def_blk_fops = {18221728 .open = blkdev_open,18231729 .release = blkdev_close,18241730 .llseek = block_llseek,18251731 .read_iter = blkdev_read_iter,18261732 .write_iter = blkdev_write_iter,18271827- .mmap = blkdev_mmap,17331733+ .mmap = generic_file_mmap,18281734 .fsync = blkdev_fsync,18291735 .unlocked_ioctl = block_ioctl,18301736#ifdef CONFIG_COMPAT
···624624{625625 return vma_hugecache_offset(hstate_vma(vma), vma, address);626626}627627+EXPORT_SYMBOL_GPL(linear_hugepage_index);627628628629/*629630 * Return the size of the pages allocated when backing a VMA. In the majority