  1. Mar 22, 2023
    • mm: teach mincore_hugetlb about pte markers · ffc08f84
      James Houghton authored
      commit 63cf5842 upstream.
      
      By checking huge_pte_none(), we incorrectly classify PTE markers as
      "present".  Instead, check huge_pte_none_mostly(), classifying PTE markers
      the same as if the PTE were completely blank.
      
      PTE markers, unlike other kinds of swap entries, don't reference any
      physical page and don't indicate that a physical page was mapped
      previously.  As such, treat them as non-present for the sake of mincore().
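
      For context only (not part of the commit): a minimal userspace sketch of
      how mincore() reports residency -- pages that were never faulted in come
      back as non-resident, which is how hugetlb PTE markers are classified
      after this fix.

       #define _GNU_SOURCE
       #include <stdio.h>
       #include <unistd.h>
       #include <sys/mman.h>

       int main(void)
       {
       	size_t pagesize = getpagesize();
       	size_t npages = 4;
       	unsigned char vec[4];
       	char *map;

       	map = mmap(NULL, npages * pagesize, PROT_READ | PROT_WRITE,
       		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       	if (map == MAP_FAILED) {
       		perror("mmap");
       		return 1;
       	}

       	/* Fault in only the first page. */
       	map[0] = 1;

       	/* One status byte per page; bit 0 set means resident. */
       	if (mincore(map, npages * pagesize, vec)) {
       		perror("mincore");
       		return 1;
       	}

       	for (size_t i = 0; i < npages; i++)
       		printf("page %zu: %s\n", i,
       		       (vec[i] & 1) ? "resident" : "not resident");
       	return 0;
       }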
      
      Link: https://lkml.kernel.org/r/20230302222404.175303-1-jthoughton@google.com
      
      
      Fixes: 5c041f5d ("mm: teach core mm about pte markers")
      Signed-off-by: James Houghton <jthoughton@google.com>
      Acked-by: Peter Xu <peterx@redhat.com>
      Acked-by: David Hildenbrand <david@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: James Houghton <jthoughton@google.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      ffc08f84
    • mm/userfaultfd: propagate uffd-wp bit when PTE-mapping the huge zeropage · 03272d6c
      David Hildenbrand authored
      commit 42b2af2c upstream.
      
      Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
      zeropage, resulting in the next write faults in the PMD range not
      triggering uffd-wp events.
      
      Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
      partial mprotect) could trigger this.  However, most importantly,
      un-protecting a single sub-page from the userfaultfd-wp handler when
      processing a uffd-wp event will PTE-map the shared huge zeropage and lose
      the uffd-wp bit for the remainder of the PMD.
      
      Let's properly propagate the uffd-wp bit to the PTEs when PTE-mapping the
      huge zeropage.
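
      The test program below write-protects a THP-backed range with
      userfaultfd, un-protects only the single faulting page from the handler
      thread, and verifies that a write to every page in the range still
      raises a uffd-wp event: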
      
       #define _GNU_SOURCE
       #include <stdio.h>
       #include <stdlib.h>
       #include <stdint.h>
       #include <stdbool.h>
       #include <inttypes.h>
       #include <fcntl.h>
       #include <unistd.h>
       #include <errno.h>
       #include <poll.h>
       #include <pthread.h>
       #include <sys/mman.h>
       #include <sys/syscall.h>
       #include <sys/ioctl.h>
       #include <linux/userfaultfd.h>
      
       static size_t pagesize;
       static int uffd;
       static volatile bool uffd_triggered;
      
       #define barrier() __asm__ __volatile__("": : :"memory")
      
       static void uffd_wp_range(char *start, size_t size, bool wp)
       {
       	struct uffdio_writeprotect uffd_writeprotect;
      
       	uffd_writeprotect.range.start = (unsigned long) start;
       	uffd_writeprotect.range.len = size;
       	if (wp) {
       		uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
       	} else {
       		uffd_writeprotect.mode = 0;
       	}
       	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
       		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
       		exit(1);
       	}
       }
      
       static void *uffd_thread_fn(void *arg)
       {
       	static struct uffd_msg msg;
       	ssize_t nread;
      
       	while (1) {
       		struct pollfd pollfd;
       		int nready;
      
       		pollfd.fd = uffd;
       		pollfd.events = POLLIN;
       		nready = poll(&pollfd, 1, -1);
       		if (nready == -1) {
       			fprintf(stderr, "poll() failed: %d\n", errno);
       			exit(1);
       		}
      
       		nread = read(uffd, &msg, sizeof(msg));
       		if (nread <= 0)
       			continue;
      
       		if (msg.event != UFFD_EVENT_PAGEFAULT ||
       		    !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
       			printf("FAIL: wrong uffd-wp event fired\n");
       			exit(1);
       		}
      
       		/* un-protect the single page. */
       		uffd_triggered = true;
       		uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address,
       			      pagesize, false);
       	}
       	return arg;
       }
      
       static int setup_uffd(char *map, size_t size)
       {
       	struct uffdio_api uffdio_api;
       	struct uffdio_register uffdio_register;
       	pthread_t thread;
      
       	uffd = syscall(__NR_userfaultfd,
       		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
       	if (uffd < 0) {
       		fprintf(stderr, "syscall() failed: %d\n", errno);
       		return -errno;
       	}
      
       	uffdio_api.api = UFFD_API;
       	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
       	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
       		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
       		return -errno;
       	}
      
       	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
       		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
       		return -ENOSYS;
       	}
      
       	uffdio_register.range.start = (unsigned long) map;
       	uffdio_register.range.len = size;
       	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
       	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
       		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
       		return -errno;
       	}
      
       	pthread_create(&thread, NULL, uffd_thread_fn, NULL);
      
       	return 0;
       }
      
       int main(void)
       {
       	const size_t size = 4 * 1024 * 1024ull;
       	char *map, *cur;
      
       	pagesize = getpagesize();
      
       	map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
       	if (map == MAP_FAILED) {
       		fprintf(stderr, "mmap() failed\n");
       		return -errno;
       	}
      
       	if (madvise(map, size, MADV_HUGEPAGE)) {
       		fprintf(stderr, "MADV_HUGEPAGE failed\n");
       		return -errno;
       	}
      
       	if (setup_uffd(map, size))
       		return 1;
      
       	/* Read the whole range, populating zeropages. */
       	madvise(map, size, MADV_POPULATE_READ);
      
       	/* Write-protect the whole range. */
       	uffd_wp_range(map, size, true);
      
       	/* Make sure uffd-wp triggers on each page. */
       	for (cur = map; cur < map + size; cur += pagesize) {
       		uffd_triggered = false;
      
       		barrier();
       		/* Trigger a write fault. */
       		*cur = 1;
       		barrier();
      
       		if (!uffd_triggered) {
       			printf("FAIL: uffd-wp did not trigger\n");
       			return 1;
       		}
       	}
      
       	printf("PASS: uffd-wp triggered\n");
       	return 0;
       }
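
      The reproducer can be built with something like "gcc -O2 -pthread" (an
      assumption, not stated in the commit); on an affected kernel it is
      expected to print "FAIL: uffd-wp did not trigger" once the huge zeropage
      has been PTE-mapped, while a fixed kernel prints "PASS: uffd-wp
      triggered".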
      
      Link: https://lkml.kernel.org/r/20230302175423.589164-1-david@redhat.com
      
      
      Fixes: e06f1e1d ("userfaultfd: wp: enabled write protection in userfaultfd API")
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Acked-by: Peter Xu <peterx@redhat.com>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: Shaohua Li <shli@fb.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      03272d6c
  2. Mar 10, 2023
    • memory tier: release the new_memtier in find_create_memory_tier() · ec9a6d47
      Tong Tiangen authored
      commit 93419139 upstream.
      
      In find_create_memory_tier(), if we fail to register the device, we should
      release new_memtier from the tier list and put the device instead of the
      memtier.
      
      Link: https://lkml.kernel.org/r/20230129040651.1329208-1-tongtiangen@huawei.com
      
      
      Fixes: 9832fb87 ("mm/demotion: expose memory tier details via sysfs")
      Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
      Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
      Cc: Hanjun Guo <guohanjun@huawei.com>
      Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
      Cc: Guohanjun <guohanjun@huawei.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      ec9a6d47
    • mm/thp: check and bail out if page in deferred queue already · 5a2a2bec
      Yin Fengwei authored
      commit 81e506be upstream.
      
      Kernel build regression with LLVM was reported here:
      https://lore.kernel.org/all/Y1GCYXGtEVZbcv%2F5@dev-arch.thelio-3990X/ with
      commit f35b5d7d ("mm: align larger anonymous mappings on THP
      boundaries").  And the commit f35b5d7d was reverted.
      
      It turned out the regression is related to ld.lld using
      madvise(MADV_DONTNEED), but with a len parameter that is not PMD_SIZE
      aligned. trace-bpfcc captured:
      531607  531732  ld.lld          do_madvise.part.0 start: 0x7feca9000000, len: 0x7fb000, behavior: 0x4
      531607  531793  ld.lld          do_madvise.part.0 start: 0x7fec86a00000, len: 0x7fb000, behavior: 0x4
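
      For illustration only (not from the original report; sizes are
      hypothetical), a call pattern like the one traced above -- a THP-backed
      mapping partially discarded with a non-PMD_SIZE-aligned MADV_DONTNEED
      length:

       #define _GNU_SOURCE
       #include <stdio.h>
       #include <string.h>
       #include <sys/mman.h>

       int main(void)
       {
       	const size_t pmd_size = 2UL << 20;	/* assumes 2 MiB THPs */
       	const size_t size = 4 * pmd_size;
       	const size_t len = size - 0x5000;	/* deliberately not PMD aligned */
       	char *map;

       	map = mmap(NULL, size, PROT_READ | PROT_WRITE,
       		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       	if (map == MAP_FAILED) {
       		perror("mmap");
       		return 1;
       	}
       	madvise(map, size, MADV_HUGEPAGE);

       	/* Fault the range in so THPs can be allocated. */
       	memset(map, 1, size);

       	/*
       	 * The trailing partial PMD makes the kernel zap individual
       	 * sub-pages of a THP, which sends the head page to the deferred
       	 * split queue on every such call.
       	 */
       	if (madvise(map, len, MADV_DONTNEED)) {
       		perror("madvise(MADV_DONTNEED)");
       		return 1;
       	}
       	return 0;
       }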
      
      If the underlying physical page is a THP, the madvise(MADV_DONTNEED)
      calls can raise split_queue_lock contention significantly. perf showed
      the following data:
          14.85%     0.00%  ld.lld           [kernel.kallsyms]           [k]
             entry_SYSCALL_64_after_hwframe
                 11.52%
                      entry_SYSCALL_64_after_hwframe
                      do_syscall_64
                      __x64_sys_madvise
                      do_madvise.part.0
                      zap_page_range
                      unmap_single_vma
                      unmap_page_range
                      page_remove_rmap
                      deferred_split_huge_page
                      __lock_text_start
                      native_queued_spin_lock_slowpath
      
      If the THP can't be removed from the rmap as a whole, it is partially
      removed by removing its sub-pages from the rmap.  Even when the THP head
      page has already been added to the deferred queue, split_queue_lock is
      still acquired just to check whether the head page is in the queue.
      Thus, contention on split_queue_lock rises.
      
      Before acquiring split_queue_lock, check and bail out early if the THP
      head page is in the queue already.  Checking without holding
      split_queue_lock could race with deferred_split_scan(), but that doesn't
      impact correctness here.
      
      Test result of building kernel with ld.lld:
      commit 7b5a0b66 (parent commit of f35b5d7d):
      time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all
              6:07.99 real,   26367.77 user,  5063.35 sys
      
      commit f35b5d7d:
      time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all
              7:22.15 real,   26235.03 user,  12504.55 sys
      
      commit f35b5d7d with the fixing patch:
      time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all
              6:08.49 real,   26520.15 user,  5047.91 sys
      
      Link: https://lkml.kernel.org/r/20221223135207.2275317-1-fengwei.yin@intel.com
      
      
      Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
      Tested-by: Nathan Chancellor <nathan@kernel.org>
      Acked-by: David Rientjes <rientjes@google.com>
      Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
      Cc: Feng Tang <feng.tang@intel.com>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      5a2a2bec
    • mm: memcontrol: deprecate charge moving · 70637798
      Johannes Weiner authored
      commit da34a848 upstream.
      
      Charge moving mode in cgroup1 allows memory to follow tasks as they
      migrate between cgroups.  This is, and always has been, a questionable
      thing to do - for several reasons.
      
      First, it's expensive.  Pages need to be identified, locked and isolated
      from various MM operations, and reassigned, one by one.
      
      Second, it's unreliable.  Once pages are charged to a cgroup, there isn't
      always a clear owner task anymore.  Cache isn't moved at all, for example.
      Mapped memory is moved - but if trylocking or isolating a page fails,
      it's arbitrarily left behind.  Frequent moving between domains may leave a
      task's memory scattered all over the place.
      
      Third, it isn't really needed.  Launcher tasks can kick off workload tasks
      directly in their target cgroup.  Using dedicated per-workload groups
      allows fine-grained policy adjustments - no need to move tasks and their
      physical pages between control domains.  The feature was never
      forward-ported to cgroup2, and it hasn't been missed.
      
      Despite it being a niche usecase, the maintenance overhead of supporting
      it is enormous.  Because pages are moved while they are live and subject
      to various MM operations, the synchronization rules are complicated.
      There are lock_page_memcg() in MM and FS code, which non-cgroup people
      don't understand.  In some cases we've been able to shift code and cgroup
      API calls around such that we can rely on native locking as much as
      possible.  But that's fragile, and sometimes we need to hold MM locks for
      longer than we otherwise would (pte lock e.g.).
      
      Mark the feature deprecated. Hopefully we can remove it soon.
      
      And backport into -stable kernels so that people who develop against
      earlier kernels are warned about this deprecation as early as possible.
      
      [akpm@linux-foundation.org: fix memory.rst underlining]
      Link: https://lkml.kernel.org/r/Y5COd+qXwk/S+n8N@cmpxchg.org
      
      
      Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
      Acked-by: Shakeel Butt <shakeelb@google.com>
      Acked-by: Hugh Dickins <hughd@google.com>
      Acked-by: Michal Hocko <mhocko@suse.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Roman Gushchin <roman.gushchin@linux.dev>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      70637798
    • mm/hwpoison: convert TTU_IGNORE_HWPOISON to TTU_HWPOISON · 6dcf132f
      Naoya Horiguchi authored
      commit 6da6b1d4 upstream.
      
      After a memory error happens on a clean folio, a process unexpectedly
      receives SIGBUS when it accesses the error page.  This SIGBUS killing is
      pointless and simply degrades the level of RAS of the system, because the
      clean folio can be dropped without any data lost on memory error handling
      as we do for a clean pagecache.
      
      When memory_failure() is called on a clean folio, try_to_unmap() is called
      twice (one from split_huge_page() and one from hwpoison_user_mappings()).
      The root cause of the issue is that pte conversion to hwpoisoned entry is
      now done in the first call of try_to_unmap() because PageHWPoison is
      already set at this point, while it's actually expected to be done in the
      second call.  This behavior disturbs the error handling operation like
      removing pagecache, which results in the malfunction described above.
      
      So convert TTU_IGNORE_HWPOISON into TTU_HWPOISON and set TTU_HWPOISON only
      when we really intend to convert pte to hwpoison entry.  This can prevent
      other callers of try_to_unmap() from accidentally converting to hwpoison
      entries.
      
      Link: https://lkml.kernel.org/r/20230221085905.1465385-1-naoya.horiguchi@linux.dev
      
      
      Fixes: a42634a6 ("readahead: Use a folio in read_pages()")
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Miaohe Lin <linmiaohe@huawei.com>
      Cc: Minchan Kim <minchan@kernel.org>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      6dcf132f
    • mm/damon/paddr: fix missing folio_put() · dca31ef8
      andrew.yang authored
      commit 3f98c9a6 upstream.
      
      damon_get_folio() always increases the folio's _refcount, and
      folio_isolate_lru() increases it again if the folio's LRU flag is set.
      
      If an unevictable folio is isolated successfully, there will be two extra
      references.  The one from folio_isolate_lru() is dropped in
      folio_putback_lru(), but the one from damon_get_folio() is left behind,
      leaving the folio pinned.
      
      Whatever the case, the _refcount taken by damon_get_folio() should be
      dropped.
      
      Link: https://lkml.kernel.org/r/20230222064223.6735-1-andrew.yang@mediatek.com
      
      
      Fixes: 57223ac2 ("mm/damon/paddr: support the pageout scheme")
      Signed-off-by: andrew.yang <andrew.yang@mediatek.com>
      Reviewed-by: SeongJae Park <sj@kernel.org>
      Cc: <stable@vger.kernel.org>	[5.16.x]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: SeongJae Park <sj@kernel.org>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      dca31ef8
    • sysctl: fix proc_dobool() usability · ee0c54cf
      Ondrej Mosnacek authored
      
      [ Upstream commit f1aa2eb5 ]
      
      Currently proc_dobool expects a (bool *) in table->data, but sizeof(int)
      in table->maxlen, because it uses do_proc_dointvec() directly.
      
      This is unsafe for at least two reasons:
      1. A sysctl table definition may use { .data = &variable, .maxlen =
         sizeof(variable) }, not realizing that this makes the sysctl unusable
         (see the Fixes: tag) and that they need to use the completely
         counterintuitive sizeof(int) instead.
      2. proc_dobool() will currently try to parse an array of values if given
         .maxlen >= 2*sizeof(int), but will try to write values of type bool
         at offsets of sizeof(int), so it will not work correctly with either
         an (int *) or a (bool *). There is no .maxlen validation to prevent
         this.
      
      Fix this by:
      1. Constraining proc_dobool() to allow only one value and .maxlen ==
         sizeof(bool).
      2. Wrapping the original struct ctl_table in a temporary one with .data
         pointing to a local int variable and .maxlen set to sizeof(int) and
         passing this one to proc_dointvec(), converting the value to/from
         bool as needed (using proc_dou8vec_minmax() as an example).
      3. Extending sysctl_check_table() to enforce proc_dobool() expectations.
      4. Fixing the proc_dobool() docstring (it was just copy-pasted from
         proc_douintvec, apparently...).
      5. Converting all existing proc_dobool() users to set .maxlen to
         sizeof(bool) instead of sizeof(int); see the sketch after this list.
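
      A hypothetical example of the corrected usage (names are made up and the
      snippet is a sketch, not taken from the patch): a bool-backed sysctl now
      declares .maxlen = sizeof(bool) and uses proc_dobool as its handler.

       static bool example_enabled;

       static struct ctl_table example_table[] = {
       	{
       		.procname	= "example_enabled",
       		.data		= &example_enabled,
       		.maxlen		= sizeof(bool),	/* was sizeof(int) before this fix */
       		.mode		= 0644,
       		.proc_handler	= proc_dobool,
       	},
       	{ }
       };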
      
      Fixes: 83efeeeb ("tty: Allow TIOCSTI to be disabled")
      Fixes: a2071573 ("sysctl: introduce new proc handler proc_dobool")
      Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
      Acked-by: Kees Cook <keescook@chromium.org>
      Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      ee0c54cf
  4. Feb 12, 2023
    • Fix page corruption caused by racy check in __free_pages · 462a8e08
      David Chen authored
      When we upgraded our kernel, we started seeing some page corruption like
      the following consistently:
      
        BUG: Bad page state in process ganesha.nfsd  pfn:1304ca
        page:0000000022261c55 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x0 pfn:0x1304ca
        flags: 0x17ffffc0000000()
        raw: 0017ffffc0000000 ffff8a513ffd4c98 ffffeee24b35ec08 0000000000000000
        raw: 0000000000000000 0000000000000001 00000000ffffff7f 0000000000000000
        page dumped because: nonzero mapcount
        CPU: 0 PID: 15567 Comm: ganesha.nfsd Kdump: loaded Tainted: P    B      O      5.10.158-1.nutanix.20221209.el7.x86_64 #1
        Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016
        Call Trace:
         dump_stack+0x74/0x96
         bad_page.cold+0x63/0x94
         check_new_page_bad+0x6d/0x80
         rmqueue+0x46e/0x970
         get_page_from_freelist+0xcb/0x3f0
         ? _cond_resched+0x19/0x40
         __alloc_pages_nodemask+0x164/0x300
         alloc_pages_current+0x87/0xf0
         skb_page_frag_refill+0x84/0x110
         ...
      
      Sometimes, it would also show up as corruption in the free list pointer
      and cause crashes.
      
      After bisecting the issue, we found the issue started from commit
      e320d301 ("mm/page_alloc.c: fix freeing non-compound pages"):
      
      	if (put_page_testzero(page))
      		free_the_page(page, order);
      	else if (!PageHead(page))
      		while (order-- > 0)
      			free_the_page(page + (1 << order), order);
      
      So the problem is that the PageHead check is racy, because at this point
      we have already dropped our reference to the page.  Even if we came in
      with a compound page, the page may already have been freed, PageHead can
      return false, and we will end up freeing all the tail pages, causing a
      double free.
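
      Roughly, the fix is to sample PageHead() before the reference is dropped
      (a sketch of the change described here, not the verbatim upstream diff):

       void __free_pages(struct page *page, unsigned int order)
       {
       	/* get PageHead before we drop reference */
       	int head = PageHead(page);

       	if (put_page_testzero(page))
       		free_the_page(page, order);
       	else if (!head)
       		while (order-- > 0)
       			free_the_page(page + (1 << order), order);
       }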
      
      Fixes: e320d301 ("mm/page_alloc.c: fix freeing non-compound pages")
      Link: https://lore.kernel.org/lkml/BYAPR02MB448855960A9656EEA81141FC94D99@BYAPR02MB4488.namprd02.prod.outlook.com/
      
      
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Cc: stable@vger.kernel.org
      Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
      Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
      Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      462a8e08
  5. Feb 09, 2023
    • mm: shrinkers: fix deadlock in shrinker debugfs · badc28d4
      Qi Zheng authored
      debugfs_remove_recursive() is invoked by unregister_shrinker(), which
      holds the write lock of shrinker_rwsem.  It waits for the handlers of the
      debugfs files to complete.  The handlers also need to take the read lock
      of shrinker_rwsem to do their work, so this may cause the following
      deadlock:
      
       	CPU0				CPU1
      
      debugfs_file_get()
      shrinker_debugfs_count_show()/shrinker_debugfs_scan_write()
      
           				unregister_shrinker()
      				--> down_write(&shrinker_rwsem);
      				    debugfs_remove_recursive()
      					// wait for (A)
      				    --> wait_for_completion();
      
          // wait for (B)
      --> down_read_killable(&shrinker_rwsem)
      debugfs_file_put() -- (A)
      
      				    up_write() -- (B)
      
      The down_read_killable() can be killed, so that the above deadlock can be
      recovered.  But it still requires an extra kill action, otherwise it will
      block all subsequent shrinker-related operations, so it's better to fix
      it.
      
      [akpm@linux-foundation.org: fix CONFIG_SHRINKER_DEBUG=n stub]
      Link: https://lkml.kernel.org/r/20230202105612.64641-1-zhengqi.arch@bytedance.com
      
      
      Fixes: 5035ebc6 ("mm: shrinkers: introduce debugfs interface for memory shrinkers")
      Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
      Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
      Cc: Kent Overstreet <kent.overstreet@gmail.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      badc28d4
    • mm: hwpoison: support recovery from ksm_might_need_to_copy() · 6b970599
      Kefeng Wang authored
      When the kernel copies a page from ksm_might_need_to_copy() but runs into
      an uncorrectable error, it will crash, since the poisoned page is
      consumed by the kernel; this is similar to the issue recently fixed by
      copy-on-write poison recovery.
      
      When an error is detected during the page copy, return VM_FAULT_HWPOISON
      in do_swap_page(), and install a hwpoison entry in unuse_pte() during
      swapoff, which helps us avoid a system crash.  Note that memory failure
      on a KSM page will be skipped, but memory_failure_queue() is still called
      to stay consistent with the general memory failure process, and KSM page
      recovery could be supported in the future.
      
      [wangkefeng.wang@huawei.com: enhance unuse_pte(), fix issue found by lkp]
        Link: https://lkml.kernel.org/r/20221213120523.141588-1-wangkefeng.wang@huawei.com
      [wangkefeng.wang@huawei.com: update changelog, alter ksm_might_need_to_copy(), restore unlikely() in unuse_pte()]
        Link: https://lkml.kernel.org/r/20230201074433.96641-1-wangkefeng.wang@huawei.com
      Link: https://lkml.kernel.org/r/20221209072801.193221-1-wangkefeng.wang@huawei.com
      
      
      Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
      Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Cc: Miaohe Lin <linmiaohe@huawei.com>
      Cc: Tony Luck <tony.luck@intel.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      6b970599
    • kasan: fix Oops due to missing calls to kasan_arch_is_ready() · 55d77bae
      Christophe Leroy authored
      On powerpc64, you can build a kernel with KASAN as soon as you build it
      with RADIX MMU support.  However if the CPU doesn't have RADIX MMU, KASAN
      isn't enabled at init and the following Oops is encountered.
      
        [    0.000000][    T0] KASAN not enabled as it requires radix!
      
        [    4.484295][   T26] BUG: Unable to handle kernel data access at 0xc00e000000804a04
        [    4.485270][   T26] Faulting instruction address: 0xc00000000062ec6c
        [    4.485748][   T26] Oops: Kernel access of bad area, sig: 11 [#1]
        [    4.485920][   T26] BE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
        [    4.486259][   T26] Modules linked in:
        [    4.486637][   T26] CPU: 0 PID: 26 Comm: kworker/u2:2 Not tainted 6.2.0-rc3-02590-gf8a023b0a805 #249
        [    4.486907][   T26] Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf000005 of:SLOF,HEAD pSeries
        [    4.487445][   T26] Workqueue: eval_map_wq .tracer_init_tracefs_work_func
        [    4.488744][   T26] NIP:  c00000000062ec6c LR: c00000000062bb84 CTR: c0000000002ebcd0
        [    4.488867][   T26] REGS: c0000000049175c0 TRAP: 0380   Not tainted  (6.2.0-rc3-02590-gf8a023b0a805)
        [    4.489028][   T26] MSR:  8000000002009032 <SF,VEC,EE,ME,IR,DR,RI>  CR: 44002808  XER: 00000000
        [    4.489584][   T26] CFAR: c00000000062bb80 IRQMASK: 0
        [    4.489584][   T26] GPR00: c0000000005624d4 c000000004917860 c000000001cfc000 1800000000804a04
        [    4.489584][   T26] GPR04: c0000000003a2650 0000000000000cc0 c00000000000d3d8 c00000000000d3d8
        [    4.489584][   T26] GPR08: c0000000049175b0 a80e000000000000 0000000000000000 0000000017d78400
        [    4.489584][   T26] GPR12: 0000000044002204 c000000003790000 c00000000435003c c0000000043f1c40
        [    4.489584][   T26] GPR16: c0000000043f1c68 c0000000043501a0 c000000002106138 c0000000043f1c08
        [    4.489584][   T26] GPR20: c0000000043f1c10 c0000000043f1c20 c000000004146c40 c000000002fdb7f8
        [    4.489584][   T26] GPR24: c000000002fdb834 c000000003685e00 c000000004025030 c000000003522e90
        [    4.489584][   T26] GPR28: 0000000000000cc0 c0000000003a2650 c000000004025020 c000000004025020
        [    4.491201][   T26] NIP [c00000000062ec6c] .kasan_byte_accessible+0xc/0x20
        [    4.491430][   T26] LR [c00000000062bb84] .__kasan_check_byte+0x24/0x90
        [    4.491767][   T26] Call Trace:
        [    4.491941][   T26] [c000000004917860] [c00000000062ae70] .__kasan_kmalloc+0xc0/0x110 (unreliable)
        [    4.492270][   T26] [c0000000049178f0] [c0000000005624d4] .krealloc+0x54/0x1c0
        [    4.492453][   T26] [c000000004917990] [c0000000003a2650] .create_trace_option_files+0x280/0x530
        [    4.492613][   T26] [c000000004917a90] [c000000002050d90] .tracer_init_tracefs_work_func+0x274/0x2c0
        [    4.492771][   T26] [c000000004917b40] [c0000000001f9948] .process_one_work+0x578/0x9f0
        [    4.492927][   T26] [c000000004917c30] [c0000000001f9ebc] .worker_thread+0xfc/0x950
        [    4.493084][   T26] [c000000004917d60] [c00000000020be84] .kthread+0x1a4/0x1b0
        [    4.493232][   T26] [c000000004917e10] [c00000000000d3d8] .ret_from_kernel_thread+0x58/0x60
        [    4.495642][   T26] Code: 60000000 7cc802a6 38a00000 4bfffc78 60000000 7cc802a6 38a00001 4bfffc68 60000000 3d20a80e 7863e8c2 792907c6 <7c6348ae> 20630007 78630fe0 68630001
        [    4.496704][   T26] ---[ end trace 0000000000000000 ]---
      
      The Oops is due to kasan_byte_accessible() not checking the readiness of
      KASAN.  Add the missing call to kasan_arch_is_ready() and bail out when
      KASAN is not ready.  The same problem is observed with
      ____kasan_kfree_large(), so fix it the same way.
      
      Also, as KASAN is not available and no shadow area is allocated for linear
      memory mapping, there is no point in allocating shadow mem for vmalloc
      memory as shown below in /sys/kernel/debug/kernel_page_tables
      
        ---[ kasan shadow mem start ]---
        0xc00f000000000000-0xc00f00000006ffff  0x00000000040f0000       448K         r  w       pte  valid  present        dirty  accessed
        0xc00f000000860000-0xc00f00000086ffff  0x000000000ac10000        64K         r  w       pte  valid  present        dirty  accessed
        0xc00f3ffffffe0000-0xc00f3fffffffffff  0x0000000004d10000       128K         r  w       pte  valid  present        dirty  accessed
        ---[ kasan shadow mem end ]---
      
      So, also verify KASAN readiness before allocating and poisoning
      shadow mem for VMAs.
      
      Link: https://lkml.kernel.org/r/150768c55722311699fdcf8f5379e8256749f47d.1674716617.git.christophe.leroy@csgroup.eu
      
      
      Fixes: 41b7a347 ("powerpc: Book3S 64-bit outline-only KASAN support")
      Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
      Reported-by: Nathan Lynch <nathanl@linux.ibm.com>
      Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
      Cc: Alexander Potapenko <glider@google.com>
      Cc: Andrey Konovalov <andreyknvl@gmail.com>
      Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
      Cc: Dmitry Vyukov <dvyukov@google.com>
      Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
      Cc: <stable@vger.kernel.org>	[5.19+]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      55d77bae
  8. Feb 01, 2023
    • mm/swapfile: add cond_resched() in get_swap_pages() · 7717fc1a
      Longlong Xia authored
      The softlockup still occurs in get_swap_pages() under memory pressure.
      The test system has 64 CPU cores, 64 GB of memory, and 28 zram devices;
      the disksize of each zram device is 50 MB, and all are configured with
      the same swap priority.  The stress-ng tool is used to increase memory
      pressure, causing the system to OOM frequently.
      
      The plist_for_each_entry_safe() loop in get_swap_pages() can iterate tens
      of thousands of times to find available space (extreme case:
      cond_resched() is not called in scan_swap_map_slots()).  Let's add
      cond_resched() to get_swap_pages() when it fails to find available space,
      to avoid the softlockup.
      
      Link: https://lkml.kernel.org/r/20230128094757.1060525-1-xialonglong1@huawei.com
      
      
      Signed-off-by: Longlong Xia <xialonglong1@huawei.com>
      Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
      Cc: Chen Wandun <chenwandun@huawei.com>
      Cc: Huang Ying <ying.huang@intel.com>
      Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
      Cc: Nanyong Sun <sunnanyong@huawei.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      7717fc1a
    • mm: use stack_depot_early_init for kmemleak · 993f57e0
      Zhaoyang Huang authored
      Mirsad reported the error below, which is caused by a stack_depot_init()
      failure in kvcalloc().  Solve this by having kmemleak use
      stack_depot_early_init().
      
      On 1/4/23 17:08, Mirsad Goran Todorovac wrote:
      I hate to bring bad news again, but there seems to be a problem with the output of /sys/kernel/debug/kmemleak:
      
      [root@pc-mtodorov ~]# cat /sys/kernel/debug/kmemleak
      unreferenced object 0xffff951c118568b0 (size 16):
      comm "kworker/u12:2", pid 56, jiffies 4294893952 (age 4356.548s)
      hex dump (first 16 bytes):
          6d 65 6d 73 74 69 63 6b 30 00 00 00 00 00 00 00 memstick0.......
          backtrace:
      [root@pc-mtodorov ~]#
      
      Apparently, backtrace of called functions on the stack is no longer
      printed with the list of memory leaks.  This appeared on Lenovo desktop
      10TX000VCR, with AlmaLinux 8.7 and BIOS version M22KT49A (11/10/2022) and
      6.2-rc1 and 6.2-rc2 builds.  This worked on 6.1 with the same
      CONFIG_KMEMLEAK=y and MGLRU enabled on a vanilla mainstream kernel from
      Mr.  Torvalds' tree.  I don't know if this is deliberate feature for some
      reason or a bug.  Please find attached the config, lshw and kmemleak
      output.
      
      [vbabka@suse.cz: remove stack_depot_init() call]
      Link: https://lore.kernel.org/all/5272a819-ef74-65ff-be61-4d2d567337de@alu.unizg.hr/
      Link: https://lkml.kernel.org/r/1674091345-14799-2-git-send-email-zhaoyang.huang@unisoc.com
      
      
      Fixes: 56a61617 ("mm: use stack_depot for recording kmemleak's backtrace")
      Reported-by: Mirsad Todorovac <mirsad.todorovac@alu.unizg.hr>
      Suggested-by: Vlastimil Babka <vbabka@suse.cz>
      Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
      Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
      Acked-by: Catalin Marinas <catalin.marinas@arm.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
      Cc: ke.wang <ke.wang@unisoc.com>
      Cc: Nathan Chancellor <nathan@kernel.org>
      Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      993f57e0
    • migrate: hugetlb: check for hugetlb shared PMD in node migration · 73bdf65e
      Mike Kravetz authored
      migrate_pages/mempolicy semantics state that CAP_SYS_NICE is required to
      move pages shared with another process to a different node.  page_mapcount
      > 1 is being used to determine if a hugetlb page is shared.  However, a
      hugetlb page will have a mapcount of 1 if mapped by multiple processes via
      a shared PMD.  As a result, hugetlb pages shared by multiple processes and
      mapped with a shared PMD can be moved by a process without CAP_SYS_NICE.
      
      To fix, check for a shared PMD if the mapcount is 1.  If a shared PMD is
      found, consider the page shared.
      
      Link: https://lkml.kernel.org/r/20230126222721.222195-3-mike.kravetz@oracle.com
      
      
      Fixes: e2d8cf40 ("migrate: add hugepage migration code to migrate_pages()")
      Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
      Acked-by: Peter Xu <peterx@redhat.com>
      Acked-by: David Hildenbrand <david@redhat.com>
      Cc: James Houghton <jthoughton@google.com>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
      Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      73bdf65e
    • mm/MADV_COLLAPSE: catch !none !huge !bad pmd lookups · edb5d0cf
      Zach O'Keefe authored
      In commit 34488399 ("mm/madvise: add file and shmem support to
      MADV_COLLAPSE") we make the following change to find_pmd_or_thp_or_none():
      
      	-       if (!pmd_present(pmde))
      	-               return SCAN_PMD_NULL;
      	+       if (pmd_none(pmde))
      	+               return SCAN_PMD_NONE;
      
      This was for-use by MADV_COLLAPSE file/shmem codepaths, where
      MADV_COLLAPSE might identify a pte-mapped hugepage, only to have
      khugepaged race-in, free the pte table, and clear the pmd.  Such codepaths
      include:
      
      A) If we find a suitably-aligned compound page of order HPAGE_PMD_ORDER
         already in the pagecache.
      B) In retract_page_tables(), if we fail to grab mmap_lock for the target
         mm/address.
      
      In these cases, collapse_pte_mapped_thp() really does expect a none (not
      just !present) pmd, and we want to suitably identify that case separate
      from the case where no pmd is found, or it's a bad-pmd (of course, many
      things could happen once we drop mmap_lock, and the pmd could plausibly
      undergo multiple transitions due to intervening fault, split, etc). 
      Regardless, the code is prepared to install a huge-pmd only when the existing
      pmd entry is either a genuine pte-table-mapping-pmd, or the none-pmd.
      
      However, the commit introduces a logical hole; namely, that we've allowed
      !none- && !huge- && !bad-pmds to be classified as genuine
      pte-table-mapping-pmds.  One such example that could leak through are swap
      entries.  The pmd values aren't checked again before use in
      pte_offset_map_lock(), which is expecting nothing less than a genuine
      pte-table-mapping-pmd.
      
      We want to put back the !pmd_present() check (below the pmd_none() check),
      but need to be careful to deal with subtleties in pmd transitions and
      treatments by various arch.
      
      The issue is that __split_huge_pmd_locked() temporarily clears the present
      bit (or otherwise marks the entry as invalid), but pmd_present() and
      pmd_trans_huge() still need to return true while the pmd is in this
      transitory state.  For example, x86's pmd_present() also checks the
      _PAGE_PSE , riscv's version also checks the _PAGE_LEAF bit, and arm64 also
      checks a PMD_PRESENT_INVALID bit.
      
      Covering all 4 cases for x86 (all checks done on the same pmd value):
      
      1) pmd_present() && pmd_trans_huge()
         All we actually know here is that the PSE bit is set. Either:
         a) We aren't racing with __split_huge_page(), and PRESENT or PROTNONE
            is set.
            => huge-pmd
         b) We are currently racing with __split_huge_page().  The danger here
            is that we proceed as-if we have a huge-pmd, but really we are
            looking at a pte-mapping-pmd.  So, what is the risk of this
            danger?
      
            The only relevant path is:
      
      	madvise_collapse() -> collapse_pte_mapped_thp()
      
            Where we might just incorrectly report back "success", when really
            the memory isn't pmd-backed.  This is fine, since split could
            happen immediately after (actually) successful madvise_collapse().
            So, it should be safe to just assume huge-pmd here.
      
      2) pmd_present() && !pmd_trans_huge()
         Either:
         a) PSE not set and either PRESENT or PROTNONE is.
            => pte-table-mapping pmd (or PROT_NONE)
         b) devmap.  This routine can be called immediately after
            unlocking/locking mmap_lock -- or called with no locks held (see
            khugepaged_scan_mm_slot()), so previous VMA checks have since been
            invalidated.
      
      3) !pmd_present() && pmd_trans_huge()
        Not possible.
      
      4) !pmd_present() && !pmd_trans_huge()
        Neither PRESENT nor PROTNONE set
        => not present
      
      I've checked all archs that implement pmd_trans_huge() (arm64, riscv,
      powerpc, loongarch, x86, mips, s390) and this logic roughly translates
      (though devmap treatment is unique to x86 and powerpc, and (3) doesn't
      necessarily hold in general -- but that doesn't matter since
      !pmd_present() always takes failure path).
      
      Also, add a comment above find_pmd_or_thp_or_none() to help future
      travelers reason about the validity of the code; namely, the possible
      mutations that might happen out from under us, depending on how mmap_lock
      is held (if at all).
      
      Link: https://lkml.kernel.org/r/20230125225358.2576151-1-zokeefe@google.com
      
      
      Fixes: 34488399 ("mm/madvise: add file and shmem support to MADV_COLLAPSE")
      Signed-off-by: Zach O'Keefe <zokeefe@google.com>
      Reported-by: Hugh Dickins <hughd@google.com>
      Reviewed-by: Yang Shi <shy828301@gmail.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      edb5d0cf
    • mm, mremap: fix mremap() expanding for vma's with vm_ops->close() · d014cd7c
      Vlastimil Babka authored
      Fabian has reported another regression in 6.1 due to ca3d76b0 ("mm:
      add merging after mremap resize").  The problem is that vma_merge() can
      fail when the vma has a vm_ops->close() method, causing the
      is_mergeable_vma() test to be negative.  This was happening for a vma
      mapping a file from
      fuse-overlayfs, which does have the method.  But when we are simply
      expanding the vma, we never remove it due to the "merge" with the added
      area, so the test should not prevent the expansion.
      
      As a quick fix, check for such vmas and expand them using vma_adjust()
      directly, as was done before commit ca3d76b0.  For a more robust
      long-term solution, we should try to limit the check for vm_ops->close
      only to cases that actually result in vma removal, so that no merge would
      be prevented unnecessarily.
      
      [akpm@linux-foundation.org: fix indenting whitespace, reflow comment]
      Link: https://lkml.kernel.org/r/20230117101939.9753-1-vbabka@suse.cz
      
      
      Fixes: ca3d76b0 ("mm: add merging after mremap resize")
      Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
      Reported-by: Fabian Vogt <fvogt@suse.com>
        Link: https://bugzilla.suse.com/show_bug.cgi?id=1206359#c35
      
      
      Tested-by: Fabian Vogt <fvogt@suse.com>
      Cc: Jakub Matěna <matenajakub@gmail.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      d014cd7c
    • mm: multi-gen LRU: fix crash during cgroup migration · de08eaa6
      Yu Zhao authored
      lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself.  This
      isn't true for the following scenario:
      
          CPU 1                         CPU 2
      
        clone()
          cgroup_can_fork()
                                      cgroup_procs_write()
          cgroup_post_fork()
                                        task_lock()
                                        lru_gen_migrate_mm()
                                        task_unlock()
          task_lock()
          lru_gen_add_mm()
          task_unlock()
      
      And when the above happens, kernel crashes because of linked list
      corruption (mm_struct->lru_gen.list).
      
      Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/
      Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com
      
      
      Fixes: bd74fdae ("mm: multi-gen LRU: support page table walks")
      Signed-off-by: Yu Zhao <yuzhao@google.com>
      Reported-by: msizanoen <msizanoen@qtmlabs.xyz>
      Tested-by: msizanoen <msizanoen@qtmlabs.xyz>
      Cc: <stable@vger.kernel.org>	[6.1+]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      de08eaa6
    • Revert "mm: add nodes= arg to memory.reclaim" · 55ab834a
      Michal Hocko authored
      This reverts commit 12a5d395.
      
      Although it is recognized that finer-grained pro-active reclaim is
      something we need and want, the semantics of this implementation are
      really ambiguous.
      
      In a follow up discussion it became clear that there are two essential
      usecases here.  One is to use memory.reclaim to pro-actively reclaim
      memory and expectation is that the requested and reported amount of memory
      is uncharged from the memcg.  Another usecase focuses on pro-active
      demotion when the memory is merely shuffled around to demotion targets
      while the overall charged memory stays unchanged.
      
      The current implementation considers demoted pages as reclaimed, and that
      breaks both usecases.  [1] tried to address the reporting part, but there
      are more issues with that, summarized in [2] and follow-up emails.
      
      Let's revert the nodemask based extension of the memcg pro-active
      reclaim for now until we settle with a more robust semantic.
      
      [1] http://lkml.kernel.org/r/20221206023406.3182800-1-almasrymina@google.com
      [2] http://lkml.kernel.org/r/Y5bsmpCyeryu3Zz1@dhcp22.suse.cz
      
      Link: https://lkml.kernel.org/r/Y5xASNe1x8cusiTx@dhcp22.suse.cz
      
      
      Fixes: 12a5d395 ("mm: add nodes= arg to memory.reclaim")
      Signed-off-by: Michal Hocko <mhocko@suse.com>
      Cc: Bagas Sanjaya <bagasdotme@gmail.com>
      Cc: Huang Ying <ying.huang@intel.com>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Jonathan Corbet <corbet@lwn.net>
      Cc: Mina Almasry <almasrymina@google.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Roman Gushchin <roman.gushchin@linux.dev>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Tejun Heo <tj@kernel.org>
      Cc: Wei Xu <weixugc@google.com>
      Cc: Yang Shi <yang.shi@linux.alibaba.com>
      Cc: Yosry Ahmed <yosryahmed@google.com>
      Cc: zefan li <lizefan.x@bytedance.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      55ab834a
    • zsmalloc: fix a race with deferred_handles storing · 85b32581
      Nhat Pham authored
      Currently, there is a race between zs_free() and zs_reclaim_page():
      zs_reclaim_page() finds a handle to an allocated object, but before the
      eviction happens, an independent zs_free() call to the same handle could
      come in and overwrite the object value stored at the handle with the last
      deferred handle.  When zs_reclaim_page() finally gets to call the eviction
      handler, it will see an invalid object value (i.e the previous deferred
      handle instead of the original object value).
      
      This race happens quite infrequently.  We only managed to produce it with
      out-of-tree developmental code that triggers zsmalloc writeback with a
      much higher frequency than usual.
      
      This patch fixes this race by storing the deferred handle in the object
      header instead.  We differentiate the deferred handle from the other two
      cases (handle for allocated object, and linkage for free object) with a
      new tag.  If zspage reclamation succeeds, we will free these deferred
      handles by walking through the zspage objects.  On the other hand, if
      zspage reclamation fails, we reconstruct the zspage freelist (with the
      deferred handle tag and allocated tag) before trying again with the
      reclamation.
      
      [arnd@arndb.de: avoid unused-function warning]
        Link: https://lkml.kernel.org/r/20230117170507.2651972-1-arnd@kernel.org
      Link: https://lkml.kernel.org/r/20230110231701.326724-1-nphamcs@gmail.com
      
      
      Fixes: 9997bc01 ("zsmalloc: implement writeback mechanism for zsmalloc")
      Signed-off-by: Nhat Pham <nphamcs@gmail.com>
      Signed-off-by: Arnd Bergmann <arnd@arndb.de>
      Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Dan Streetman <ddstreet@ieee.org>
      Cc: Minchan Kim <minchan@kernel.org>
      Cc: Nitin Gupta <ngupta@vflare.org>
      Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
      Cc: Seth Jennings <sjenning@redhat.com>
      Cc: Vitaly Wool <vitaly.wool@konsulko.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      85b32581
    • mm/khugepaged: fix ->anon_vma race · 023f47a8
      Jann Horn authored
      If an ->anon_vma is attached to the VMA, collapse_and_free_pmd() requires
      it to be locked.
      
      Page table traversal is allowed under any one of the mmap lock, the
      anon_vma lock (if the VMA is associated with an anon_vma), and the
      mapping lock (if the VMA is associated with a mapping); and so to be
      able to remove page tables, we must hold all three of them. 
      retract_page_tables() bails out if an ->anon_vma is attached, but does
      this check before holding the mmap lock (as the comment above the check
      explains).
      
      If we racily merged an existing ->anon_vma (shared with a child
      process) from a neighboring VMA, subsequent rmap traversals on pages
      belonging to the child will be able to see the page tables that we are
      concurrently removing while assuming that nothing else can access them.
      
      Repeat the ->anon_vma check once we hold the mmap lock to ensure that
      there really is no concurrent page table access.
      
      Hitting this bug causes a lockdep warning in collapse_and_free_pmd(),
      in the line "lockdep_assert_held_write(&vma->anon_vma->root->rwsem)". 
      It can also lead to use-after-free access.
      
      Link: https://lore.kernel.org/linux-mm/CAG48ez3434wZBKFFbdx4M9j6eUwSUVPd4dxhzW_k_POneSDF+A@mail.gmail.com/
      Link: https://lkml.kernel.org/r/20230111133351.807024-1-jannh@google.com
      
      
      Fixes: f3f0e1d2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
      Signed-off-by: Jann Horn <jannh@google.com>
      Reported-by: Zach O'Keefe <zokeefe@google.com>
      Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
      Reviewed-by: Yang Shi <shy828301@gmail.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      023f47a8