1
0
Fork 0

mm/hmm: use device driver encoding for HMM pfn

Users of hmm_vma_fault() and hmm_vma_get_pfns() provide a flags array and
pfn shift value allowing them to define their own encoding for HMM pfn
that are fill inside the pfns array of the hmm_range struct.  With this
device driver can get pfn that match their own private encoding out of HMM
without having to do any conversion.

[rcampbell@nvidia.com: don't ignore specific pte fault flag in hmm_vma_fault()]
  Link: http://lkml.kernel.org/r/20180326213009.2460-2-jglisse@redhat.com
[rcampbell@nvidia.com: clarify fault logic for device private memory]
  Link: http://lkml.kernel.org/r/20180326213009.2460-3-jglisse@redhat.com
Link: http://lkml.kernel.org/r/20180323005527.758-16-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
hifive-unleashed-5.1
Jérôme Glisse 2018-04-10 16:29:06 -07:00 committed by Linus Torvalds
parent 2aee09d8c1
commit f88a1e90c6
2 changed files with 152 additions and 77 deletions

View File

@ -80,68 +80,145 @@
struct hmm; struct hmm;
/* /*
* hmm_pfn_flag_e - HMM flag enums
*
* Flags: * Flags:
* HMM_PFN_VALID: pfn is valid. It has, at least, read permission. * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
* HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_WRITE: CPU page table has write permission set
* HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
*
* The driver provide a flags array, if driver valid bit for an entry is bit
* 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide
* an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
* Same logic apply to all flags. This is same idea as vm_page_prot in vma
* except that this is per device driver rather than per architecture.
*/
enum hmm_pfn_flag_e {
HMM_PFN_VALID = 0,
HMM_PFN_WRITE,
HMM_PFN_DEVICE_PRIVATE,
HMM_PFN_FLAG_MAX
};
/*
* hmm_pfn_value_e - HMM pfn special value
*
* Flags:
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
* HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
* result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
* be mirrored by a device, because the entry will never have HMM_PFN_VALID * be mirrored by a device, because the entry will never have HMM_PFN_VALID
* set and the pfn value is undefined. * set and the pfn value is undefined.
* HMM_PFN_DEVICE_PRIVATE: unaddressable device memory (ZONE_DEVICE) *
* Driver provide entry value for none entry, error entry and special entry,
* driver can alias (ie use same value for error and special for instance). It
* should not alias none and error or special.
*
* HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
* hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
* hmm_range.values[HMM_PFN_NONE] if there is no CPU page table
* hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
*/ */
#define HMM_PFN_VALID (1 << 0) enum hmm_pfn_value_e {
#define HMM_PFN_WRITE (1 << 1) HMM_PFN_ERROR,
#define HMM_PFN_ERROR (1 << 2) HMM_PFN_NONE,
#define HMM_PFN_SPECIAL (1 << 3) HMM_PFN_SPECIAL,
#define HMM_PFN_DEVICE_PRIVATE (1 << 4) HMM_PFN_VALUE_MAX
#define HMM_PFN_SHIFT 5 };
/*
* struct hmm_range - track invalidation lock on virtual address range
*
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @pfns: array of pfns (big enough for the range)
* @flags: pfn flags to match device driver page table
* @values: pfn value for some special case (none, special, error, ...)
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
unsigned long end;
uint64_t *pfns;
const uint64_t *flags;
const uint64_t *values;
uint8_t pfn_shift;
bool valid;
};
/* /*
* hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
* @range: range use to decode HMM pfn value
* @pfn: HMM pfn value to get corresponding struct page from * @pfn: HMM pfn value to get corresponding struct page from
* Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
* *
* If the HMM pfn is valid (ie valid flag set) then return the struct page * If the HMM pfn is valid (ie valid flag set) then return the struct page
* matching the pfn value stored in the HMM pfn. Otherwise return NULL. * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
*/ */
static inline struct page *hmm_pfn_to_page(uint64_t pfn) static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
uint64_t pfn)
{ {
if (!(pfn & HMM_PFN_VALID)) if (pfn == range->values[HMM_PFN_NONE])
return NULL; return NULL;
return pfn_to_page(pfn >> HMM_PFN_SHIFT); if (pfn == range->values[HMM_PFN_ERROR])
return NULL;
if (pfn == range->values[HMM_PFN_SPECIAL])
return NULL;
if (!(pfn & range->flags[HMM_PFN_VALID]))
return NULL;
return pfn_to_page(pfn >> range->pfn_shift);
} }
/* /*
* hmm_pfn_to_pfn() - return pfn value store in a HMM pfn * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn
* @range: range use to decode HMM pfn value
* @pfn: HMM pfn value to extract pfn from * @pfn: HMM pfn value to extract pfn from
* Returns: pfn value if HMM pfn is valid, -1UL otherwise * Returns: pfn value if HMM pfn is valid, -1UL otherwise
*/ */
static inline unsigned long hmm_pfn_to_pfn(uint64_t pfn) static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
uint64_t pfn)
{ {
if (!(pfn & HMM_PFN_VALID)) if (pfn == range->values[HMM_PFN_NONE])
return -1UL; return -1UL;
return (pfn >> HMM_PFN_SHIFT); if (pfn == range->values[HMM_PFN_ERROR])
return -1UL;
if (pfn == range->values[HMM_PFN_SPECIAL])
return -1UL;
if (!(pfn & range->flags[HMM_PFN_VALID]))
return -1UL;
return (pfn >> range->pfn_shift);
} }
/* /*
* hmm_pfn_from_page() - create a valid HMM pfn value from struct page * hmm_pfn_from_page() - create a valid HMM pfn value from struct page
* @range: range use to encode HMM pfn value
* @page: struct page pointer for which to create the HMM pfn * @page: struct page pointer for which to create the HMM pfn
* Returns: valid HMM pfn for the page * Returns: valid HMM pfn for the page
*/ */
static inline uint64_t hmm_pfn_from_page(struct page *page) static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
struct page *page)
{ {
return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
} }
/* /*
* hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
* @range: range use to encode HMM pfn value
* @pfn: pfn value for which to create the HMM pfn * @pfn: pfn value for which to create the HMM pfn
* Returns: valid HMM pfn for the pfn * Returns: valid HMM pfn for the pfn
*/ */
static inline uint64_t hmm_pfn_from_pfn(unsigned long pfn) static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
unsigned long pfn)
{ {
return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; return (pfn << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
} }
@ -263,25 +340,6 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror); void hmm_mirror_unregister(struct hmm_mirror *mirror);
/*
* struct hmm_range - track invalidation lock on virtual address range
*
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @pfns: array of pfns (big enough for the range)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
unsigned long end;
uint64_t *pfns;
bool valid;
};
/* /*
* To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
* driver lock that serializes device page table updates, then call * driver lock that serializes device page table updates, then call

View File

@ -306,6 +306,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
{ {
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma; struct vm_area_struct *vma = walk->vma;
int r; int r;
@ -315,7 +316,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
if (r & VM_FAULT_RETRY) if (r & VM_FAULT_RETRY)
return -EBUSY; return -EBUSY;
if (r & VM_FAULT_ERROR) { if (r & VM_FAULT_ERROR) {
*pfn = HMM_PFN_ERROR; *pfn = range->values[HMM_PFN_ERROR];
return -EFAULT; return -EFAULT;
} }
@ -333,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr,
i = (addr - range->start) >> PAGE_SHIFT; i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++) for (; addr < end; addr += PAGE_SIZE, i++)
pfns[i] = HMM_PFN_ERROR; pfns[i] = range->values[HMM_PFN_ERROR];
return 0; return 0;
} }
@ -362,7 +363,7 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
hmm_vma_walk->last = addr; hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT; i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++) { for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = 0; pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) { if (fault || write_fault) {
int ret; int ret;
@ -380,24 +381,31 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
uint64_t pfns, uint64_t cpu_flags, uint64_t pfns, uint64_t cpu_flags,
bool *fault, bool *write_fault) bool *fault, bool *write_fault)
{ {
struct hmm_range *range = hmm_vma_walk->range;
*fault = *write_fault = false; *fault = *write_fault = false;
if (!hmm_vma_walk->fault) if (!hmm_vma_walk->fault)
return; return;
/* We aren't ask to do anything ... */ /* We aren't ask to do anything ... */
if (!(pfns & HMM_PFN_VALID)) if (!(pfns & range->flags[HMM_PFN_VALID]))
return; return;
/* If CPU page table is not valid then we need to fault */ /* If this is device memory than only fault if explicitly requested */
*fault = cpu_flags & HMM_PFN_VALID; if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
/* Need to write fault ? */ /* Do we fault on device memory ? */
if ((pfns & HMM_PFN_WRITE) && !(cpu_flags & HMM_PFN_WRITE)) { if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
*fault = *write_fault = false; *write_fault = pfns & range->flags[HMM_PFN_WRITE];
*fault = true;
}
return; return;
} }
/* Do we fault on device memory ? */
if ((pfns & HMM_PFN_DEVICE_PRIVATE) && /* If CPU page table is not valid then we need to fault */
(cpu_flags & HMM_PFN_DEVICE_PRIVATE)) { *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
*write_fault = pfns & HMM_PFN_WRITE; /* Need to write fault ? */
if ((pfns & range->flags[HMM_PFN_WRITE]) &&
!(cpu_flags & range->flags[HMM_PFN_WRITE])) {
*write_fault = true;
*fault = true; *fault = true;
} }
} }
@ -439,13 +447,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
} }
static inline uint64_t pmd_to_hmm_pfn_flags(pmd_t pmd) static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{ {
if (pmd_protnone(pmd)) if (pmd_protnone(pmd))
return 0; return 0;
return pmd_write(pmd) ? HMM_PFN_VALID | return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
HMM_PFN_WRITE : range->flags[HMM_PFN_WRITE] :
HMM_PFN_VALID; range->flags[HMM_PFN_VALID];
} }
static int hmm_vma_handle_pmd(struct mm_walk *walk, static int hmm_vma_handle_pmd(struct mm_walk *walk,
@ -455,12 +463,13 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
pmd_t pmd) pmd_t pmd)
{ {
struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i; unsigned long pfn, npages, i;
uint64_t flag = 0, cpu_flags;
bool fault, write_fault; bool fault, write_fault;
uint64_t cpu_flags;
npages = (end - addr) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT;
cpu_flags = pmd_to_hmm_pfn_flags(pmd); cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
&fault, &write_fault); &fault, &write_fault);
@ -468,20 +477,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr); pfn = pmd_pfn(pmd) + pte_index(addr);
flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
pfns[i] = hmm_pfn_from_pfn(pfn) | flag; pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
hmm_vma_walk->last = end; hmm_vma_walk->last = end;
return 0; return 0;
} }
static inline uint64_t pte_to_hmm_pfn_flags(pte_t pte) static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{ {
if (pte_none(pte) || !pte_present(pte)) if (pte_none(pte) || !pte_present(pte))
return 0; return 0;
return pte_write(pte) ? HMM_PFN_VALID | return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
HMM_PFN_WRITE : range->flags[HMM_PFN_WRITE] :
HMM_PFN_VALID; range->flags[HMM_PFN_VALID];
} }
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
@ -489,14 +497,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t *pfn) uint64_t *pfn)
{ {
struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma; struct vm_area_struct *vma = walk->vma;
bool fault, write_fault; bool fault, write_fault;
uint64_t cpu_flags; uint64_t cpu_flags;
pte_t pte = *ptep; pte_t pte = *ptep;
uint64_t orig_pfn = *pfn;
*pfn = 0; *pfn = range->values[HMM_PFN_NONE];
cpu_flags = pte_to_hmm_pfn_flags(pte); cpu_flags = pte_to_hmm_pfn_flags(range, pte);
hmm_pte_need_fault(hmm_vma_walk, *pfn, cpu_flags, hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
&fault, &write_fault); &fault, &write_fault);
if (pte_none(pte)) { if (pte_none(pte)) {
@ -519,11 +529,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* device and report anything else as error. * device and report anything else as error.
*/ */
if (is_device_private_entry(entry)) { if (is_device_private_entry(entry)) {
cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE; cpu_flags = range->flags[HMM_PFN_VALID] |
range->flags[HMM_PFN_DEVICE_PRIVATE];
cpu_flags |= is_write_device_private_entry(entry) ? cpu_flags |= is_write_device_private_entry(entry) ?
HMM_PFN_WRITE : 0; range->flags[HMM_PFN_WRITE] : 0;
*pfn = hmm_pfn_from_pfn(swp_offset(entry)); hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
*pfn |= HMM_PFN_DEVICE_PRIVATE; &fault, &write_fault);
if (fault || write_fault)
goto fault;
*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
*pfn |= cpu_flags;
return 0; return 0;
} }
@ -539,14 +554,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
} }
/* Report error for everything else */ /* Report error for everything else */
*pfn = HMM_PFN_ERROR; *pfn = range->values[HMM_PFN_ERROR];
return -EFAULT; return -EFAULT;
} }
if (fault || write_fault) if (fault || write_fault)
goto fault; goto fault;
*pfn = hmm_pfn_from_pfn(pte_pfn(pte)) | cpu_flags; *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0; return 0;
fault: fault:
@ -615,12 +630,13 @@ again:
return 0; return 0;
} }
static void hmm_pfns_clear(uint64_t *pfns, static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr, unsigned long addr,
unsigned long end) unsigned long end)
{ {
for (; addr < end; addr += PAGE_SIZE, pfns++) for (; addr < end; addr += PAGE_SIZE, pfns++)
*pfns = 0; *pfns = range->values[HMM_PFN_NONE];
} }
static void hmm_pfns_special(struct hmm_range *range) static void hmm_pfns_special(struct hmm_range *range)
@ -628,7 +644,7 @@ static void hmm_pfns_special(struct hmm_range *range)
unsigned long addr = range->start, i = 0; unsigned long addr = range->start, i = 0;
for (; addr < range->end; addr += PAGE_SIZE, i++) for (; addr < range->end; addr += PAGE_SIZE, i++)
range->pfns[i] = HMM_PFN_SPECIAL; range->pfns[i] = range->values[HMM_PFN_SPECIAL];
} }
/* /*
@ -681,7 +697,7 @@ int hmm_vma_get_pfns(struct hmm_range *range)
* write without read access are not supported by HMM, because * write without read access are not supported by HMM, because
* operations such has atomic access would not work. * operations such has atomic access would not work.
*/ */
hmm_pfns_clear(range->pfns, range->start, range->end); hmm_pfns_clear(range, range->pfns, range->start, range->end);
return -EPERM; return -EPERM;
} }
@ -834,7 +850,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
hmm = hmm_register(vma->vm_mm); hmm = hmm_register(vma->vm_mm);
if (!hmm) { if (!hmm) {
hmm_pfns_clear(range->pfns, range->start, range->end); hmm_pfns_clear(range, range->pfns, range->start, range->end);
return -ENOMEM; return -ENOMEM;
} }
/* Caller must have registered a mirror using hmm_mirror_register() */ /* Caller must have registered a mirror using hmm_mirror_register() */
@ -854,7 +870,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
* write without read access are not supported by HMM, because * write without read access are not supported by HMM, because
* operations such has atomic access would not work. * operations such has atomic access would not work.
*/ */
hmm_pfns_clear(range->pfns, range->start, range->end); hmm_pfns_clear(range, range->pfns, range->start, range->end);
return -EPERM; return -EPERM;
} }
@ -887,7 +903,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
unsigned long i; unsigned long i;
i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
hmm_pfns_clear(&range->pfns[i], hmm_vma_walk.last, range->end); hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
range->end);
hmm_vma_range_done(range); hmm_vma_range_done(range);
} }
return ret; return ret;