1
0
Fork 0

Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:

 - almost all of the rest of MM

 - kasan updates

 - lots of procfs work

 - misc things

 - lib/ updates

 - checkpatch

 - rapidio

 - ipc/shm updates

 - the start of willy's XArray conversion

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (140 commits)
  page cache: use xa_lock
  xarray: add the xa_lock to the radix_tree_root
  fscache: use appropriate radix tree accessors
  export __set_page_dirty
  unicore32: turn flush_dcache_mmap_lock into a no-op
  arm64: turn flush_dcache_mmap_lock into a no-op
  mac80211_hwsim: use DEFINE_IDA
  radix tree: use GFP_ZONEMASK bits of gfp_t for flags
  linux/const.h: refactor _BITUL and _BITULL a bit
  linux/const.h: move UL() macro to include/linux/const.h
  linux/const.h: prefix include guard of uapi/linux/const.h with _UAPI
  xen, mm: allow deferred page initialization for xen pv domains
  elf: enforce MAP_FIXED on overlaying elf segments
  fs, elf: drop MAP_FIXED usage from elf_map
  mm: introduce MAP_FIXED_NOREPLACE
  MAINTAINERS: update bouncing aacraid@adaptec.com addresses
  fs/dcache.c: add cond_resched() in shrink_dentry_list()
  include/linux/kfifo.h: fix comment
  ipc/shm.c: shm_split(): remove unneeded test for NULL shm_file_data.vm_ops
  kernel/sysctl.c: add kdoc comments to do_proc_do{u}intvec_minmax_conv_param
  ...
hifive-unleashed-5.1
Linus Torvalds 2018-04-11 10:51:26 -07:00
commit 8837c70d53
182 changed files with 4164 additions and 2100 deletions

428
.clang-format 100644
View File

@ -0,0 +1,428 @@
# SPDX-License-Identifier: GPL-2.0
#
# clang-format configuration file. Intended for clang-format >= 4.
#
# For more information, see:
#
# Documentation/process/clang-format.rst
# https://clang.llvm.org/docs/ClangFormat.html
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: true
  AfterNamespace: true
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  #AfterExternBlock: false # Unknown to clang-format-5.0
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  #SplitEmptyFunction: true # Unknown to clang-format-4.0
  #SplitEmptyRecord: true # Unknown to clang-format-4.0
  #SplitEmptyNamespace: true # Unknown to clang-format-4.0
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
#CompactNamespaces: false # Unknown to clang-format-4.0
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
#FixNamespaceComments: false # Unknown to clang-format-4.0
# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
# | sort | uniq
ForEachMacros:
- 'apei_estatus_for_each_section'
- 'ata_for_each_dev'
- 'ata_for_each_link'
- 'ax25_for_each'
- 'ax25_uid_for_each'
- 'bio_for_each_integrity_vec'
- '__bio_for_each_segment'
- 'bio_for_each_segment'
- 'bio_for_each_segment_all'
- 'bio_list_for_each'
- 'bip_for_each_vec'
- 'blkg_for_each_descendant_post'
- 'blkg_for_each_descendant_pre'
- 'blk_queue_for_each_rl'
- 'bond_for_each_slave'
- 'bond_for_each_slave_rcu'
- 'btree_for_each_safe128'
- 'btree_for_each_safe32'
- 'btree_for_each_safe64'
- 'btree_for_each_safel'
- 'card_for_each_dev'
- 'cgroup_taskset_for_each'
- 'cgroup_taskset_for_each_leader'
- 'cpufreq_for_each_entry'
- 'cpufreq_for_each_entry_idx'
- 'cpufreq_for_each_valid_entry'
- 'cpufreq_for_each_valid_entry_idx'
- 'css_for_each_child'
- 'css_for_each_descendant_post'
- 'css_for_each_descendant_pre'
- 'device_for_each_child_node'
- 'drm_atomic_crtc_for_each_plane'
- 'drm_atomic_crtc_state_for_each_plane'
- 'drm_atomic_crtc_state_for_each_plane_state'
- 'drm_for_each_connector_iter'
- 'drm_for_each_crtc'
- 'drm_for_each_encoder'
- 'drm_for_each_encoder_mask'
- 'drm_for_each_fb'
- 'drm_for_each_legacy_plane'
- 'drm_for_each_plane'
- 'drm_for_each_plane_mask'
- 'drm_mm_for_each_hole'
- 'drm_mm_for_each_node'
- 'drm_mm_for_each_node_in_range'
- 'drm_mm_for_each_node_safe'
- 'for_each_active_drhd_unit'
- 'for_each_active_iommu'
- 'for_each_available_child_of_node'
- 'for_each_bio'
- 'for_each_board_func_rsrc'
- 'for_each_bvec'
- 'for_each_child_of_node'
- 'for_each_clear_bit'
- 'for_each_clear_bit_from'
- 'for_each_cmsghdr'
- 'for_each_compatible_node'
- 'for_each_console'
- 'for_each_cpu'
- 'for_each_cpu_and'
- 'for_each_cpu_not'
- 'for_each_cpu_wrap'
- 'for_each_dev_addr'
- 'for_each_dma_cap_mask'
- 'for_each_drhd_unit'
- 'for_each_dss_dev'
- 'for_each_efi_memory_desc'
- 'for_each_efi_memory_desc_in_map'
- 'for_each_endpoint_of_node'
- 'for_each_evictable_lru'
- 'for_each_fib6_node_rt_rcu'
- 'for_each_fib6_walker_rt'
- 'for_each_free_mem_range'
- 'for_each_free_mem_range_reverse'
- 'for_each_func_rsrc'
- 'for_each_hstate'
- 'for_each_if'
- 'for_each_iommu'
- 'for_each_ip_tunnel_rcu'
- 'for_each_irq_nr'
- 'for_each_lru'
- 'for_each_matching_node'
- 'for_each_matching_node_and_match'
- 'for_each_memblock'
- 'for_each_memblock_type'
- 'for_each_memcg_cache_index'
- 'for_each_mem_pfn_range'
- 'for_each_mem_range'
- 'for_each_mem_range_rev'
- 'for_each_migratetype_order'
- 'for_each_msi_entry'
- 'for_each_net'
- 'for_each_netdev'
- 'for_each_netdev_continue'
- 'for_each_netdev_continue_rcu'
- 'for_each_netdev_feature'
- 'for_each_netdev_in_bond_rcu'
- 'for_each_netdev_rcu'
- 'for_each_netdev_reverse'
- 'for_each_netdev_safe'
- 'for_each_net_rcu'
- 'for_each_new_connector_in_state'
- 'for_each_new_crtc_in_state'
- 'for_each_new_plane_in_state'
- 'for_each_new_private_obj_in_state'
- 'for_each_node'
- 'for_each_node_by_name'
- 'for_each_node_by_type'
- 'for_each_node_mask'
- 'for_each_node_state'
- 'for_each_node_with_cpus'
- 'for_each_node_with_property'
- 'for_each_of_allnodes'
- 'for_each_of_allnodes_from'
- 'for_each_of_pci_range'
- 'for_each_old_connector_in_state'
- 'for_each_old_crtc_in_state'
- 'for_each_oldnew_connector_in_state'
- 'for_each_oldnew_crtc_in_state'
- 'for_each_oldnew_plane_in_state'
- 'for_each_oldnew_private_obj_in_state'
- 'for_each_old_plane_in_state'
- 'for_each_old_private_obj_in_state'
- 'for_each_online_cpu'
- 'for_each_online_node'
- 'for_each_online_pgdat'
- 'for_each_pci_bridge'
- 'for_each_pci_dev'
- 'for_each_pci_msi_entry'
- 'for_each_populated_zone'
- 'for_each_possible_cpu'
- 'for_each_present_cpu'
- 'for_each_prime_number'
- 'for_each_prime_number_from'
- 'for_each_process'
- 'for_each_process_thread'
- 'for_each_property_of_node'
- 'for_each_reserved_mem_region'
- 'for_each_resv_unavail_range'
- 'for_each_rtdcom'
- 'for_each_rtdcom_safe'
- 'for_each_set_bit'
- 'for_each_set_bit_from'
- 'for_each_sg'
- 'for_each_sg_page'
- '__for_each_thread'
- 'for_each_thread'
- 'for_each_zone'
- 'for_each_zone_zonelist'
- 'for_each_zone_zonelist_nodemask'
- 'fwnode_for_each_available_child_node'
- 'fwnode_for_each_child_node'
- 'fwnode_graph_for_each_endpoint'
- 'gadget_for_each_ep'
- 'hash_for_each'
- 'hash_for_each_possible'
- 'hash_for_each_possible_rcu'
- 'hash_for_each_possible_rcu_notrace'
- 'hash_for_each_possible_safe'
- 'hash_for_each_rcu'
- 'hash_for_each_safe'
- 'hctx_for_each_ctx'
- 'hlist_bl_for_each_entry'
- 'hlist_bl_for_each_entry_rcu'
- 'hlist_bl_for_each_entry_safe'
- 'hlist_for_each'
- 'hlist_for_each_entry'
- 'hlist_for_each_entry_continue'
- 'hlist_for_each_entry_continue_rcu'
- 'hlist_for_each_entry_continue_rcu_bh'
- 'hlist_for_each_entry_from'
- 'hlist_for_each_entry_from_rcu'
- 'hlist_for_each_entry_rcu'
- 'hlist_for_each_entry_rcu_bh'
- 'hlist_for_each_entry_rcu_notrace'
- 'hlist_for_each_entry_safe'
- '__hlist_for_each_rcu'
- 'hlist_for_each_safe'
- 'hlist_nulls_for_each_entry'
- 'hlist_nulls_for_each_entry_from'
- 'hlist_nulls_for_each_entry_rcu'
- 'hlist_nulls_for_each_entry_safe'
- 'ide_host_for_each_port'
- 'ide_port_for_each_dev'
- 'ide_port_for_each_present_dev'
- 'idr_for_each_entry'
- 'idr_for_each_entry_continue'
- 'idr_for_each_entry_ul'
- 'inet_bind_bucket_for_each'
- 'inet_lhash2_for_each_icsk_rcu'
- 'iov_for_each'
- 'key_for_each'
- 'key_for_each_safe'
- 'klp_for_each_func'
- 'klp_for_each_object'
- 'kvm_for_each_memslot'
- 'kvm_for_each_vcpu'
- 'list_for_each'
- 'list_for_each_entry'
- 'list_for_each_entry_continue'
- 'list_for_each_entry_continue_rcu'
- 'list_for_each_entry_continue_reverse'
- 'list_for_each_entry_from'
- 'list_for_each_entry_from_reverse'
- 'list_for_each_entry_lockless'
- 'list_for_each_entry_rcu'
- 'list_for_each_entry_reverse'
- 'list_for_each_entry_safe'
- 'list_for_each_entry_safe_continue'
- 'list_for_each_entry_safe_from'
- 'list_for_each_entry_safe_reverse'
- 'list_for_each_prev'
- 'list_for_each_prev_safe'
- 'list_for_each_safe'
- 'llist_for_each'
- 'llist_for_each_entry'
- 'llist_for_each_entry_safe'
- 'llist_for_each_safe'
- 'media_device_for_each_entity'
- 'media_device_for_each_intf'
- 'media_device_for_each_link'
- 'media_device_for_each_pad'
- 'netdev_for_each_lower_dev'
- 'netdev_for_each_lower_private'
- 'netdev_for_each_lower_private_rcu'
- 'netdev_for_each_mc_addr'
- 'netdev_for_each_uc_addr'
- 'netdev_for_each_upper_dev_rcu'
- 'netdev_hw_addr_list_for_each'
- 'nft_rule_for_each_expr'
- 'nla_for_each_attr'
- 'nla_for_each_nested'
- 'nlmsg_for_each_attr'
- 'nlmsg_for_each_msg'
- 'nr_neigh_for_each'
- 'nr_neigh_for_each_safe'
- 'nr_node_for_each'
- 'nr_node_for_each_safe'
- 'of_for_each_phandle'
- 'of_property_for_each_string'
- 'of_property_for_each_u32'
- 'pci_bus_for_each_resource'
- 'ping_portaddr_for_each_entry'
- 'plist_for_each'
- 'plist_for_each_continue'
- 'plist_for_each_entry'
- 'plist_for_each_entry_continue'
- 'plist_for_each_entry_safe'
- 'plist_for_each_safe'
- 'pnp_for_each_card'
- 'pnp_for_each_dev'
- 'protocol_for_each_card'
- 'protocol_for_each_dev'
- 'queue_for_each_hw_ctx'
- 'radix_tree_for_each_contig'
- 'radix_tree_for_each_slot'
- 'radix_tree_for_each_tagged'
- 'rbtree_postorder_for_each_entry_safe'
- 'resource_list_for_each_entry'
- 'resource_list_for_each_entry_safe'
- 'rhl_for_each_entry_rcu'
- 'rhl_for_each_rcu'
- 'rht_for_each'
- 'rht_for_each_continue'
- 'rht_for_each_entry'
- 'rht_for_each_entry_continue'
- 'rht_for_each_entry_rcu'
- 'rht_for_each_entry_rcu_continue'
- 'rht_for_each_entry_safe'
- 'rht_for_each_rcu'
- 'rht_for_each_rcu_continue'
- '__rq_for_each_bio'
- 'rq_for_each_segment'
- 'scsi_for_each_prot_sg'
- 'scsi_for_each_sg'
- 'sctp_for_each_hentry'
- 'sctp_skb_for_each'
- 'shdma_for_each_chan'
- '__shost_for_each_device'
- 'shost_for_each_device'
- 'sk_for_each'
- 'sk_for_each_bound'
- 'sk_for_each_entry_offset_rcu'
- 'sk_for_each_from'
- 'sk_for_each_rcu'
- 'sk_for_each_safe'
- 'sk_nulls_for_each'
- 'sk_nulls_for_each_from'
- 'sk_nulls_for_each_rcu'
- 'snd_pcm_group_for_each_entry'
- 'snd_soc_dapm_widget_for_each_path'
- 'snd_soc_dapm_widget_for_each_path_safe'
- 'snd_soc_dapm_widget_for_each_sink_path'
- 'snd_soc_dapm_widget_for_each_source_path'
- 'tb_property_for_each'
- 'udp_portaddr_for_each_entry'
- 'udp_portaddr_for_each_entry_rcu'
- 'usb_hub_for_each_child'
- 'v4l2_device_for_each_subdev'
- 'v4l2_m2m_for_each_dst_buf'
- 'v4l2_m2m_for_each_dst_buf_safe'
- 'v4l2_m2m_for_each_src_buf'
- 'v4l2_m2m_for_each_src_buf_safe'
- 'zorro_for_each_dev'
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
IncludeCategories:
  - Regex: '.*'
    Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
#IndentPPDirectives: None # Unknown to clang-format-5.0
IndentWidth: 8
IndentWrappedFunctionNames: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
ObjCBlockIndentWidth: 8
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Taken from git's rules
#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
PenaltyBreakBeforeFirstCallParameter: 30
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 0
PenaltyBreakString: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
#SortUsingDeclarations: false # Unknown to clang-format-4.0
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
SpaceBeforeParens: ControlStatements
#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp03
TabWidth: 8
UseTab: Always
...

1
.gitignore vendored
View File

@ -81,6 +81,7 @@ modules.builtin
!.gitignore
!.mailmap
!.cocciconfig
!.clang-format
#
# Generated include files

View File

@ -262,7 +262,7 @@ When oom event notifier is registered, event will be delivered.
2.6 Locking
lock_page_cgroup()/unlock_page_cgroup() should not be called under
mapping->tree_lock.
the i_pages lock.
Other lock order is following:
PG_locked.

View File

@ -58,6 +58,14 @@ can never be transgressed. If there is a good reason to go against the
style (a line which becomes far less readable if split to fit within the
80-column limit, for example), just do it.
Note that you can also use the ``clang-format`` tool to help you with
these rules, to quickly re-format parts of your code automatically,
and to review full files in order to spot coding style mistakes,
typos and possible improvements. It is also handy for sorting ``#includes``,
for aligning variables/macros, for reflowing text and other similar tasks.
See the file :ref:`Documentation/process/clang-format.rst <clangformat>`
for more details.
Abstraction layers
******************

View File

@ -0,0 +1,184 @@
.. _clangformat:
clang-format
============
``clang-format`` is a tool to format C/C++/... code according to
a set of rules and heuristics. Like most tools, it is not perfect
nor covers every single case, but it is good enough to be helpful.
``clang-format`` can be used for several purposes:
- Quickly reformat a block of code to the kernel style. Especially useful
when moving code around and aligning/sorting. See clangformatreformat_.
- Spot style mistakes, typos and possible improvements in files
you maintain, patches you review, diffs, etc. See clangformatreview_.
- Help you follow the coding style rules, specially useful for those
new to kernel development or working at the same time in several
projects with different coding styles.
Its configuration file is ``.clang-format`` in the root of the kernel tree.
The rules contained there try to approximate the most common kernel
coding style. They also try to follow :ref:`Documentation/process/coding-style.rst <codingstyle>`
as much as possible. Since not all the kernel follows the same style,
it is possible that you may want to tweak the defaults for a particular
subsystem or folder. To do so, you can override the defaults by writing
another ``.clang-format`` file in a subfolder.
The tool itself has already been included in the repositories of popular
Linux distributions for a long time. Search for ``clang-format`` in
your repositories. Otherwise, you can either download pre-built
LLVM/clang binaries or build the source code from:
http://releases.llvm.org/download.html
See more information about the tool at:
https://clang.llvm.org/docs/ClangFormat.html
https://clang.llvm.org/docs/ClangFormatStyleOptions.html
.. _clangformatreview:
Review files and patches for coding style
-----------------------------------------
By running the tool in its inline mode, you can review full subsystems,
folders or individual files for code style mistakes, typos or improvements.
To do so, you can run something like::
# Make sure your working directory is clean!
clang-format -i kernel/*.[ch]
And then take a look at the git diff.
Counting the lines of such a diff is also useful for improving/tweaking
the style options in the configuration file; as well as testing new
``clang-format`` features/versions.
``clang-format`` also supports reading unified diffs, so you can review
patches and git diffs easily. See the documentation at:
https://clang.llvm.org/docs/ClangFormat.html#script-for-patch-reformatting
To avoid ``clang-format`` formatting some portion of a file, you can do::
int formatted_code;
// clang-format off
void unformatted_code ;
// clang-format on
void formatted_code_again;
While it might be tempting to use this to keep a file always in sync with
``clang-format``, specially if you are writing new files or if you are
a maintainer, please note that people might be running different
``clang-format`` versions or not have it available at all. Therefore,
you should probably refrain from using this in kernel sources;
at least until we see if ``clang-format`` becomes commonplace.
.. _clangformatreformat:
Reformatting blocks of code
---------------------------
By using an integration with your text editor, you can reformat arbitrary
blocks (selections) of code with a single keystroke. This is especially
useful when moving code around, for complex code that is deeply indented,
for multi-line macros (and aligning their backslashes), etc.
Remember that you can always tweak the changes afterwards in those cases
where the tool did not do an optimal job. But as a first approximation,
it can be very useful.
There are integrations for many popular text editors. For some of them,
like vim, emacs, BBEdit and Visual Studio you can find support built-in.
For instructions, read the appropriate section at:
https://clang.llvm.org/docs/ClangFormat.html
For Atom, Eclipse, Sublime Text, Visual Studio Code, XCode and other
editors and IDEs you should be able to find ready-to-use plugins.
For this use case, consider using a secondary ``.clang-format``
so that you can tweak a few options. See clangformatextra_.
.. _clangformatmissing:
Missing support
---------------
``clang-format`` is missing support for some things that are common
in kernel code. They are easy to remember, so if you use the tool
regularly, you will quickly learn to avoid/ignore those.
In particular, some very common ones you will notice are:
- Aligned blocks of one-line ``#defines``, e.g.::
#define TRACING_MAP_BITS_DEFAULT       11
#define TRACING_MAP_BITS_MAX           17
#define TRACING_MAP_BITS_MIN           7
vs.::
#define TRACING_MAP_BITS_DEFAULT 11
#define TRACING_MAP_BITS_MAX 17
#define TRACING_MAP_BITS_MIN 7
- Aligned designated initializers, e.g.::
static const struct file_operations uprobe_events_ops = {
        .owner          = THIS_MODULE,
        .open           = probes_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
        .write          = probes_write,
};
vs.::
static const struct file_operations uprobe_events_ops = {
.owner = THIS_MODULE,
.open = probes_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
.write = probes_write,
};
.. _clangformatextra:
Extra features/options
----------------------
Some features/style options are not enabled by default in the configuration
file in order to minimize the differences between the output and the current
code. In other words, to make the difference as small as possible,
which makes reviewing full-file style, as well as diffs and patches, as easy
as possible.
In other cases (e.g. particular subsystems/folders/files), the kernel style
might be different and enabling some of these options may approximate
better the style there.
For instance:
- Aligning assignments (``AlignConsecutiveAssignments``).
- Aligning declarations (``AlignConsecutiveDeclarations``).
- Reflowing text in comments (``ReflowComments``).
- Sorting ``#includes`` (``SortIncludes``).
They are typically useful for block re-formatting, rather than full-file.
You might want to create another ``.clang-format`` file and use that one
from your editor/IDE instead.

View File

@ -631,6 +631,14 @@ options ``-kr -i8`` (stands for ``K&R, 8 character indents``), or use
re-formatting you may want to take a look at the man page. But
remember: ``indent`` is not a fix for bad programming.
Note that you can also use the ``clang-format`` tool to help you with
these rules, to quickly re-format parts of your code automatically,
and to review full files in order to spot coding style mistakes,
typos and possible improvements. It is also handy for sorting ``#includes``,
for aligning variables/macros, for reflowing text and other similar tasks.
See the file :ref:`Documentation/process/clang-format.rst <clangformat>`
for more details.
10) Kconfig configuration files
-------------------------------

View File

@ -964,32 +964,34 @@ detect a hard lockup condition.
tainted:
Non-zero if the kernel has been tainted. Numeric values, which
can be ORed together:
Non-zero if the kernel has been tainted. Numeric values, which can be
ORed together. The letters are seen in "Tainted" line of Oops reports.
1 - A module with a non-GPL license has been loaded, this
includes modules with no license.
Set by modutils >= 2.4.9 and module-init-tools.
2 - A module was force loaded by insmod -f.
Set by modutils >= 2.4.9 and module-init-tools.
4 - Unsafe SMP processors: SMP with CPUs not designed for SMP.
8 - A module was forcibly unloaded from the system by rmmod -f.
16 - A hardware machine check error occurred on the system.
32 - A bad page was discovered on the system.
64 - The user has asked that the system be marked "tainted". This
could be because they are running software that directly modifies
the hardware, or for other reasons.
128 - The system has died.
256 - The ACPI DSDT has been overridden with one supplied by the user
instead of using the one provided by the hardware.
512 - A kernel warning has occurred.
1024 - A module from drivers/staging was loaded.
2048 - The system is working around a severe firmware bug.
4096 - An out-of-tree module has been loaded.
8192 - An unsigned module has been loaded in a kernel supporting module
signature.
16384 - A soft lockup has previously occurred on the system.
32768 - The kernel has been live patched.
1 (P): A module with a non-GPL license has been loaded, this
includes modules with no license.
Set by modutils >= 2.4.9 and module-init-tools.
2 (F): A module was force loaded by insmod -f.
Set by modutils >= 2.4.9 and module-init-tools.
4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP.
8 (R): A module was forcibly unloaded from the system by rmmod -f.
16 (M): A hardware machine check error occurred on the system.
32 (B): A bad page was discovered on the system.
64 (U): The user has asked that the system be marked "tainted". This
could be because they are running software that directly modifies
the hardware, or for other reasons.
128 (D): The system has died.
256 (A): The ACPI DSDT has been overridden with one supplied by the user
instead of using the one provided by the hardware.
512 (W): A kernel warning has occurred.
1024 (C): A module from drivers/staging was loaded.
2048 (I): The system is working around a severe firmware bug.
4096 (O): An out-of-tree module has been loaded.
8192 (E): An unsigned module has been loaded in a kernel supporting module
signature.
16384 (L): A soft lockup has previously occurred on the system.
32768 (K): The kernel has been live patched.
65536 (X): Auxiliary taint, defined for and used by distros.
131072 (T): The kernel was built with the struct randomization plugin.
==============================================================

View File

@ -312,8 +312,6 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file.
% cat /proc/sys/vm/lowmem_reserve_ratio
256 256 32
-
Note: # of this elements is one fewer than number of zones. Because the highest
zone's value is not necessary for following calculation.
But, these values are not used directly. The kernel calculates # of protection
pages for each zones from them. These are shown as array of protection pages
@ -364,7 +362,8 @@ As above expression, they are reciprocal number of ratio.
pages of higher zones on the node.
If you would like to protect more pages, smaller values are effective.
The minimum value is 1 (1/1 -> 100%).
The minimum value is 1 (1/1 -> 100%). A value less than 1 completely
disables protection of the pages.
==============================================================

View File

@ -1,152 +1,160 @@
Heterogeneous Memory Management (HMM)
Transparently allow any component of a program to use any memory region of said
program with a device without using device specific memory allocator. This is
becoming a requirement to simplify the use of advance heterogeneous computing
where GPU, DSP or FPGA are use to perform various computations.
Provide infrastructure and helpers to integrate non-conventional memory (device
memory like GPU on board memory) into regular kernel path, with the cornerstone
of this being specialized struct page for such memory (see sections 5 to 7 of
this document).
This document is divided as follow, in the first section i expose the problems
related to the use of a device specific allocator. The second section i expose
the hardware limitations that are inherent to many platforms. The third section
gives an overview of HMM designs. The fourth section explains how CPU page-
table mirroring works and what is HMM purpose in this context. Fifth section
deals with how device memory is represented inside the kernel. Finaly the last
section present the new migration helper that allow to leverage the device DMA
engine.
HMM also provides optional helpers for SVM (Shared Virtual Memory), i.e.,
allowing a device to transparently access program addresses coherently with the
CPU meaning that any valid pointer on the CPU is also a valid pointer for the
device. This is becoming mandatory to simplify the use of advanced hetero-
geneous computing where GPU, DSP, or FPGA are used to perform various
computations on behalf of a process.
This document is divided as follows: in the first section I expose the problems
related to using device specific memory allocators. In the second section, I
expose the hardware limitations that are inherent to many platforms. The third
section gives an overview of the HMM design. The fourth section explains how
CPU page-table mirroring works and the purpose of HMM in this context. The
fifth section deals with how device memory is represented inside the kernel.
Finally, the last section presents a new migration helper that allows lever-
aging the device DMA engine.
1) Problems of using device specific memory allocator:
2) System bus, device memory characteristics
3) Share address space and migration
1) Problems of using a device specific memory allocator:
2) I/O bus, device memory characteristics
3) Shared address space and migration
4) Address space mirroring implementation and API
5) Represent and manage device memory from core kernel point of view
6) Migrate to and from device memory
6) Migration to and from device memory
7) Memory cgroup (memcg) and rss accounting
-------------------------------------------------------------------------------
1) Problems of using device specific memory allocator:
1) Problems of using a device specific memory allocator:
Device with large amount of on board memory (several giga bytes) like GPU have
historically manage their memory through dedicated driver specific API. This
creates a disconnect between memory allocated and managed by device driver and
regular application memory (private anonymous, share memory or regular file
back memory). From here on i will refer to this aspect as split address space.
I use share address space to refer to the opposite situation ie one in which
any memory region can be use by device transparently.
Devices with a large amount of on board memory (several gigabytes) like GPUs
have historically managed their memory through dedicated driver specific APIs.
This creates a disconnect between memory allocated and managed by a device
driver and regular application memory (private anonymous, shared memory, or
regular file backed memory). From here on I will refer to this aspect as split
address space. I use shared address space to refer to the opposite situation:
i.e., one in which any application memory region can be used by a device
transparently.
Split address space because device can only access memory allocated through the
device specific API. This imply that all memory object in a program are not
equal from device point of view which complicate large program that rely on a
wide set of libraries.
Split address space happens because a device can only access memory allocated
through a device specific API. This implies that all memory objects in a program
are not equal from the device point of view which complicates large programs
that rely on a wide set of libraries.
Concretly this means that code that wants to leverage device like GPU need to
copy object between genericly allocated memory (malloc, mmap private/share/)
and memory allocated through the device driver API (this still end up with an
mmap but of the device file).
Concretely this means that code that wants to leverage devices like GPUs needs
to copy objects between generically allocated memory (malloc, mmap private, mmap
shared) and memory allocated through the device driver API (this still ends up
with an mmap but of the device file).
For flat dataset (array, grid, image, ...) this isn't too hard to achieve but
complex data-set (list, tree, ...) are hard to get right. Duplicating a complex
data-set need to re-map all the pointer relations between each of its elements.
This is error prone and program gets harder to debug because of the duplicate
data-set.
For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
complex data sets (list, tree, ...) are hard to get right. Duplicating a
complex data set needs to re-map all the pointer relations between each of its
elements. This is error prone and programs get harder to debug because of the
duplicate data set and addresses.
Split address space also means that library can not transparently use data they
are getting from core program or other library and thus each library might have
to duplicate its input data-set using specific memory allocator. Large project
suffer from this and waste resources because of the various memory copy.
Split address space also means that libraries cannot transparently use data
they are getting from the core program or another library and thus each library
might have to duplicate its input data set using the device specific memory
allocator. Large projects suffer from this and waste resources because of the
various memory copies.
Duplicating each library API to accept as input or output memory allocted by
Duplicating each library API to accept as input or output memory allocated by
each device specific allocator is not a viable option. It would lead to a
combinatorial explosions in the library entry points.
combinatorial explosion in the library entry points.
Finaly with the advance of high level language constructs (in C++ but in other
language too) it is now possible for compiler to leverage GPU or other devices
without even the programmer knowledge. Some of compiler identified patterns are
only do-able with a share address. It is as well more reasonable to use a share
address space for all the other patterns.
Finally, with the advance of high level language constructs (in C++ but in
other languages too) it is now possible for the compiler to leverage GPUs and
other devices without programmer knowledge. Some compiler identified patterns
are only do-able with a shared address space. It is also more reasonable to use
a shared address space for all other patterns.
-------------------------------------------------------------------------------
2) System bus, device memory characteristics
2) I/O bus, device memory characteristics
System bus cripple share address due to few limitations. Most system bus only
allow basic memory access from device to main memory, even cache coherency is
often optional. Access to device memory from CPU is even more limited, most
often than not it is not cache coherent.
I/O buses cripple shared address spaces due to a few limitations. Most I/O
buses only allow basic memory access from device to main memory; even cache
coherency is often optional. Access to device memory from CPU is even more
limited. More often than not, it is not cache coherent.
If we only consider the PCIE bus than device can access main memory (often
through an IOMMU) and be cache coherent with the CPUs. However it only allows
a limited set of atomic operation from device on main memory. This is worse
in the other direction the CPUs can only access a limited range of the device
memory and can not perform atomic operations on it. Thus device memory can not
be consider like regular memory from kernel point of view.
If we only consider the PCIE bus, then a device can access main memory (often
through an IOMMU) and be cache coherent with the CPUs. However, it only allows
a limited set of atomic operations from device on main memory. This is worse
in the other direction: the CPU can only access a limited range of the device
memory and cannot perform atomic operations on it. Thus device memory cannot
be considered the same as regular memory from the kernel point of view.
Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s).
The final limitation is latency, access to main memory from the device has an
order of magnitude higher latency than when the device access its own memory.
and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
The final limitation is latency. Access to main memory from the device has an
order of magnitude higher latency than when the device accesses its own memory.
Some platform are developing new system bus or additions/modifications to PCIE
to address some of those limitations (OpenCAPI, CCIX). They mainly allow two
Some platforms are developing new I/O buses or additions/modifications to PCIE
to address some of these limitations (OpenCAPI, CCIX). They mainly allow two-
way cache coherency between CPU and device and allow all atomic operations the
architecture supports. Saddly not all platform are following this trends and
some major architecture are left without hardware solutions to those problems.
architecture supports. Sadly, not all platforms are following this trend and
some major architectures are left without hardware solutions to these problems.
So for share address space to make sense not only we must allow device to
access any memory memory but we must also permit any memory to be migrated to
device memory while device is using it (blocking CPU access while it happens).
So for shared address space to make sense, not only must we allow devices to
access any memory but we must also permit any memory to be migrated to device
memory while device is using it (blocking CPU access while it happens).
-------------------------------------------------------------------------------
3) Share address space and migration
3) Shared address space and migration
HMM intends to provide two main features. First one is to share the address
space by duplication the CPU page table into the device page table so same
address point to same memory and this for any valid main memory address in
space by duplicating the CPU page table in the device page table so the same
address points to the same physical memory for any valid main memory address in
the process address space.
To achieve this, HMM offer a set of helpers to populate the device page table
To achieve this, HMM offers a set of helpers to populate the device page table
while keeping track of CPU page table updates. Device page table updates are
not as easy as CPU page table updates. To update the device page table you must
allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics
commands in it to perform the update (unmap, cache invalidations and flush,
...). This can not be done through common code for all device. Hence why HMM
provides helpers to factor out everything that can be while leaving the gory
details to the device driver.
not as easy as CPU page table updates. To update the device page table, you must
allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
specific commands in it to perform the update (unmap, cache invalidations, and
flush, ...). This cannot be done through common code for all devices, which is
why HMM provides helpers to factor out everything that can be while leaving the
hardware specific details to the device driver.
The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does
allow to allocate a struct page for each page of the device memory. Those page
are special because the CPU can not map them. They however allow to migrate
main memory to device memory using exhisting migration mechanism and everything
looks like if page was swap out to disk from CPU point of view. Using a struct
page gives the easiest and cleanest integration with existing mm mechanisms.
Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory
for the device memory and second to perform migration. Policy decision of what
and when to migrate things is left to the device driver.
The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
allows allocating a struct page for each page of the device memory. Those pages
are special because the CPU cannot map them. However, they allow migrating
main memory to device memory using existing migration mechanisms and everything
looks like a page is swapped out to disk from the CPU point of view. Using a
struct page gives the easiest and cleanest integration with existing mm mech-
anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
memory for the device memory and second to perform migration. Policy decisions
of what and when to migrate things are left to the device driver.
Note that any CPU access to a device page trigger a page fault and a migration
back to main memory ie when a page backing an given address A is migrated from
a main memory page to a device page then any CPU access to address A trigger a
page fault and initiate a migration back to main memory.
Note that any CPU access to a device page triggers a page fault and a migration
back to main memory. For example, when a page backing a given CPU address A is
migrated from a main memory page to a device page, then any CPU access to
address A triggers a page fault and initiates a migration back to main memory.
With this two features, HMM not only allow a device to mirror a process address
space and keeps both CPU and device page table synchronize, but also allow to
leverage device memory by migrating part of data-set that is actively use by a
device.
With these two features, HMM not only allows a device to mirror a process
address space and keeps both CPU and device page tables synchronized, but also
leverages device memory by migrating the part of the data set that is actively
being used by the device.
-------------------------------------------------------------------------------
4) Address space mirroring implementation and API
Address space mirroring main objective is to allow to duplicate range of CPU
page table into a device page table and HMM helps keeping both synchronize. A
device driver that want to mirror a process address space must start with the
Address space mirroring's main objective is to allow duplication of a range of
CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
registration of an hmm_mirror struct:
int hmm_mirror_register(struct hmm_mirror *mirror,
@ -154,9 +162,9 @@ registration of an hmm_mirror struct:
int hmm_mirror_register_locked(struct hmm_mirror *mirror,
struct mm_struct *mm);
The locked variant is to be use when the driver is already holding the mmap_sem
of the mm in write mode. The mirror struct has a set of callback that are use
to propagate CPU page table:
The locked variant is to be used when the driver is already holding mmap_sem
of the mm in write mode. The mirror struct has a set of callbacks that are used
to propagate CPU page tables:
struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
@ -181,13 +189,13 @@ to propagate CPU page table:
unsigned long end);
};
Device driver must perform update to the range following action (turn range
read only, or fully unmap, ...). Once driver callback returns the device must
be done with the update.
The device driver must perform the update action to the range (mark range
read only, or fully unmap, ...). The device must be done with the update before
the driver callback returns.
When device driver wants to populate a range of virtual address it can use
either:
When the device driver wants to populate a range of virtual addresses, it can
use either:
int hmm_vma_get_pfns(struct vm_area_struct *vma,
struct hmm_range *range,
unsigned long start,
@ -201,17 +209,19 @@ either:
bool write,
bool block);
First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and
will not trigger a page fault on missing or non present entry. The second one
do trigger page fault on missing or read only entry if write parameter is true.
Page fault use the generic mm page fault code path just like a CPU page fault.
The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
The second one does trigger a page fault on missing or read-only entries if
the write parameter is true. Page faults use the generic mm page fault code
path just like a CPU page fault.
Both function copy CPU page table into their pfns array argument. Each entry in
that array correspond to an address in the virtual range. HMM provide a set of
flags to help driver identify special CPU page table entries.
Both functions copy CPU page table entries into their pfns array argument. Each
entry in that array corresponds to an address in the virtual range. HMM
provides a set of flags to help the driver identify special CPU page table
entries.
Locking with the update() callback is the most important aspect the driver must
respect in order to keep things properly synchronize. The usage pattern is :
respect in order to keep things properly synchronized. The usage pattern is:
int driver_populate_range(...)
{
@ -233,43 +243,44 @@ respect in order to keep things properly synchronize. The usage pattern is :
return 0;
}
The driver->update lock is the same lock that driver takes inside its update()
callback. That lock must be call before hmm_vma_range_done() to avoid any race
with a concurrent CPU page table update.
The driver->update lock is the same lock that the driver takes inside its
update() callback. That lock must be held before hmm_vma_range_done() to avoid
any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted to a
simpler API and also to be able to perform optimization latter own like doing
concurrent device update in multi-devices scenario.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations later on, like doing
concurrent device updates in multi-device scenarios.
HMM also serve as an impedence missmatch between how CPU page table update are
done (by CPU write to the page table and TLB flushes) from how device update
their own page table. Device update is a multi-step process, first appropriate
commands are write to a buffer, then this buffer is schedule for execution on
the device. It is only once the device has executed commands in the buffer that
the update is done. Creating and scheduling update command buffer can happen
concurrently for multiple devices. Waiting for each device to report commands
as executed is serialize (there is no point in doing this concurrently).
HMM also serves as an impedance mismatch between how CPU page table updates
are done (by CPU write to the page table and TLB flushes) and how devices
update their own page table. Device updates are a multi-step process. First,
appropriate commands are written to a buffer, then this buffer is scheduled for
execution on the device. It is only once the device has executed commands in
the buffer that the update is done. Creating and scheduling the update command
buffer can happen concurrently for multiple devices. Waiting for each device to
report commands as executed is serialized (there is no point in doing this
concurrently).
-------------------------------------------------------------------------------
5) Represent and manage device memory from core kernel point of view
Several differents design were try to support device memory. First one use
device specific data structure to keep information about migrated memory and
HMM hooked itself in various place of mm code to handle any access to address
that were back by device memory. It turns out that this ended up replicating
most of the fields of struct page and also needed many kernel code path to be
updated to understand this new kind of memory.
Several different designs were tried to support device memory. The first one
used a device specific data structure to keep information about migrated memory
and HMM hooked itself in various places of mm code to handle any access to
addresses that were backed by device memory. It turns out that this ended up
replicating most of the fields of struct page and also needed many kernel code
paths to be updated to understand this new kind of memory.
Thing is most kernel code path never try to access the memory behind a page
but only care about struct page contents. Because of this HMM switchted to
directly using struct page for device memory which left most kernel code path
un-aware of the difference. We only need to make sure that no one ever try to
map those page from the CPU side.
Most kernel code paths never try to access the memory behind a page
but only care about struct page contents. Because of this, HMM switched to
directly using struct page for device memory which left most kernel code paths
unaware of the difference. We only need to make sure that no one ever tries to
map those pages from the CPU side.
HMM provide a set of helpers to register and hotplug device memory as a new
region needing struct page. This is offer through a very simple API:
HMM provides a set of helpers to register and hotplug device memory as a new
region needing a struct page. This is offered through a very simple API:
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
struct device *device,
@ -289,18 +300,19 @@ The hmm_devmem_ops is where most of the important things are:
};
The first callback (free()) happens when the last reference on a device page is
drop. This means the device page is now free and no longer use by anyone. The
second callback happens whenever CPU try to access a device page which it can
not do. This second callback must trigger a migration back to system memory.
dropped. This means the device page is now free and no longer used by anyone.
The second callback happens whenever the CPU tries to access a device page
which it cannot do. This second callback must trigger a migration back to
system memory.
-------------------------------------------------------------------------------
6) Migrate to and from device memory
6) Migration to and from device memory
Because CPU can not access device memory, migration must use device DMA engine
to perform copy from and to device memory. For this we need a new migration
helper:
Because the CPU cannot access device memory, migration must use the device DMA
engine to perform copy from and to device memory. For this we need a new
migration helper:
int migrate_vma(const struct migrate_vma_ops *ops,
struct vm_area_struct *vma,
@ -311,15 +323,15 @@ helper:
unsigned long *dst,
void *private);
Unlike other migration function it works on a range of virtual address, there
is two reasons for that. First device DMA copy has a high setup overhead cost
Unlike other migration functions, it works on a range of virtual addresses.
There are two reasons for that. First, device DMA copy has a high setup
overhead cost
and thus batching multiple pages is needed as otherwise the migration overhead
make the whole excersie pointless. The second reason is because driver trigger
such migration base on range of address the device is actively accessing.
makes the whole exercise pointless. The second reason is because the
migration might be for a range of addresses the device is actively accessing.
The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy())
control destination memory allocation and copy operation. Second one is there
to allow device driver to perform cleanup operation after migration.
The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
controls destination memory allocation and copy operation. Second one is there
to allow the device driver to perform cleanup operations after migration.
struct migrate_vma_ops {
void (*alloc_and_copy)(struct vm_area_struct *vma,
@ -336,19 +348,19 @@ to allow device driver to perform cleanup operation after migration.
void *private);
};
It is important to stress that this migration helpers allow for hole in the
It is important to stress that these migration helpers allow for holes in the
virtual address range. Some pages in the range might not be migrated for all
the usual reasons (page is pin, page is lock, ...). This helper does not fail
but just skip over those pages.
the usual reasons (page is pinned, page is locked, ...). This helper does not
fail but just skips over those pages.
The alloc_and_copy() might as well decide to not migrate all pages in the
range (for reasons under the callback control). For those the callback just
have to leave the corresponding dst entry empty.
The alloc_and_copy() might decide to not migrate all pages in the
range (for reasons under the callback control). For those, the callback just
has to leave the corresponding dst entry empty.
Finaly the migration of the struct page might fails (for file back page) for
Finally, the migration of the struct page might fail (for file backed page) for
various reasons (failure to freeze reference, or update page cache, ...). If
that happens then the finalize_and_map() can catch any pages that was not
migrated. Note those page were still copied to new page and thus we wasted
that happens, then the finalize_and_map() can catch any pages that were not
migrated. Note those pages were still copied to a new page and thus we wasted
bandwidth but this is considered as a rare event and a price that we are
willing to pay to keep all the code simpler.
@ -358,27 +370,27 @@ willing to pay to keep all the code simpler.
7) Memory cgroup (memcg) and rss accounting
For now device memory is accounted as any regular page in rss counters (either
anonymous if device page is use for anonymous, file if device page is use for
file back page or shmem if device page is use for share memory). This is a
deliberate choice to keep existing application that might start using device
memory without knowing about it to keep runing unimpacted.
anonymous if device page is used for anonymous, file if device page is used for
file backed page or shmem if device page is used for shared memory). This is a
deliberate choice to keep existing applications, that might start using device
memory without knowing about it, running unimpacted.
Drawbacks is that OOM killer might kill an application using a lot of device
memory and not a lot of regular system memory and thus not freeing much system
memory. We want to gather more real world experience on how application and
system react under memory pressure in the presence of device memory before
A drawback is that the OOM killer might kill an application using a lot of
device memory and not a lot of regular system memory and thus not freeing much
system memory. We want to gather more real world experience on how applications
and system react under memory pressure in the presence of device memory before
deciding to account device memory differently.
Same decision was made for memory cgroup. Device memory page are accounted
Same decision was made for memory cgroup. Device memory pages are accounted
against same memory cgroup a regular page would be accounted to. This does
simplify migration to and from device memory. This also means that migration
back from device memory to regular memory can not fail because it would
back from device memory to regular memory cannot fail because it would
go above memory cgroup limit. We might revisit this choice later on once we
get more experience in how device memory is use and its impact on memory
get more experience in how device memory is used and its impact on memory
resource control.
Note that device memory can never be pin nor by device driver nor through GUP
Note that device memory can never be pinned by a device driver or through GUP
and thus such memory is always free upon process exit. Or when last reference
is drop in case of share memory or file back memory.
is dropped in case of shared memory or file backed memory.

View File

@ -90,7 +90,7 @@ Steps:
1. Lock the page to be migrated
2. Insure that writeback is complete.
2. Ensure that writeback is complete.
3. Lock the new page that we want to move to. It is locked so that accesses to
this (not yet uptodate) page immediately block while the move is in progress.
@ -100,8 +100,8 @@ Steps:
mapcount is not zero then we do not migrate the page. All user space
processes that attempt to access the page will now wait on the page lock.
5. The radix tree lock is taken. This will cause all processes trying
to access the page via the mapping to block on the radix tree spinlock.
5. The i_pages lock is taken. This will cause all processes trying
to access the page via the mapping to block on the spinlock.
6. The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.
@ -114,12 +114,12 @@ Steps:
9. The radix tree is changed to point to the new page.
10. The reference count of the old page is dropped because the radix tree
10. The reference count of the old page is dropped because the address space
reference is gone. A reference to the new page is established because
the new page is referenced to by the radix tree.
the new page is referenced by the address space.
11. The radix tree lock is dropped. With that lookups in the mapping
become possible again. Processes will move from spinning on the tree_lock
11. The i_pages lock is dropped. With that lookups in the mapping
become possible again. Processes will move from spinning on the lock
to sleeping on the locked new page.
12. The page contents are copied to the new page.

View File

@ -4392,7 +4392,7 @@ S: Maintained
F: drivers/staging/fsl-dpaa2/ethsw
DPT_I2O SCSI RAID DRIVER
M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
@ -6410,6 +6410,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/hmm*
F: include/linux/hmm*
F: Documentation/vm/hmm.txt
HOST AP DRIVER
M: Jouni Malinen <j@w1.fi>
@ -7344,7 +7345,7 @@ F: include/linux/ipmi*
F: include/uapi/linux/ipmi*
IPS SCSI RAID DRIVER
M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
@ -11762,7 +11763,7 @@ F: drivers/char/random.c
RAPIDIO SUBSYSTEM
M: Matt Porter <mporter@kernel.crashing.org>
M: Alexandre Bounine <alexandre.bounine@idt.com>
M: Alexandre Bounine <alex.bou9@gmail.com>
S: Maintained
F: drivers/rapidio/

View File

@ -32,6 +32,7 @@
#define MAP_NONBLOCK 0x40000 /* do not block on IO */
#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x100000 /* create a huge page mapping */
#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_SYNC 2 /* synchronous memory sync */

View File

@ -318,10 +318,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
extern void flush_kernel_dcache_page(struct page *);
#define flush_dcache_mmap_lock(mapping) \
spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) \
spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_user_range(vma,page,addr,len) \
flush_dcache_page(page)

View File

@ -22,12 +22,6 @@
#include <mach/memory.h>
#endif
/*
* Allow for constants defined here to be used from assembly code
* by prepending the UL suffix only with actual C code compilation.
*/
#define UL(x) _AC(x, UL)
/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)

View File

@ -466,6 +466,12 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
void __init dma_contiguous_remap(void)
{
int i;
if (!dma_mmu_remap_num)
return;
/* call flush_cache_all() since CMA area would be large enough */
flush_cache_all();
for (i = 0; i < dma_mmu_remap_num; i++) {
phys_addr_t start = dma_mmu_remap[i].base;
phys_addr_t end = start + dma_mmu_remap[i].size;
@ -498,7 +504,15 @@ void __init dma_contiguous_remap(void)
flush_tlb_kernel_range(__phys_to_virt(start),
__phys_to_virt(end));
iotable_init(&map, 1);
/*
* All the memory in CMA region will be on ZONE_MOVABLE.
* If that zone is considered as highmem, the memory in CMA
* region is also considered as highmem even if its
* physical address belongs to lowmem. In this case,
* re-mapping isn't required.
*/
if (!is_highmem_idx(ZONE_MOVABLE))
iotable_init(&map, 1);
}
}

View File

@ -21,20 +21,20 @@
#define MIN_GAP (128*1024*1024UL)
#define MAX_GAP ((TASK_SIZE)/6*5)
static int mmap_is_legacy(void)
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
static unsigned long mmap_base(unsigned long rnd)
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
@ -180,18 +180,18 @@ unsigned long arch_mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View File

@ -140,10 +140,8 @@ static inline void __flush_icache_all(void)
dsb(ish);
}
#define flush_dcache_mmap_lock(mapping) \
spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) \
spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
/*
* We don't appear to need to do anything here. In fact, if we did, we'd

View File

@ -28,12 +28,6 @@
#include <asm/page-def.h>
#include <asm/sizes.h>
/*
* Allow for constants defined here to be used from assembly code
* by prepending the UL suffix only with actual C code compilation.
*/
#define UL(x) _AC(x, UL)
/*
* Size of the PCI I/O space. This must remain a power of two so that
* IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.

View File

@ -38,12 +38,12 @@
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP/6*5)
static int mmap_is_legacy(void)
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
@ -62,9 +62,9 @@ unsigned long arch_mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
static unsigned long mmap_base(unsigned long rnd)
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
/* Values close to RLIM_INFINITY can overflow. */
@ -83,7 +83,7 @@ static unsigned long mmap_base(unsigned long rnd)
* This function, called very early during the creation of a new process VM
* image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@ -94,11 +94,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* Fall back to the standard layout if the personality bit is set, or
* if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View File

@ -50,6 +50,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
/*
* Flags for msync

View File

@ -24,20 +24,20 @@ EXPORT_SYMBOL(shm_align_mask);
#define MIN_GAP (128*1024*1024UL)
#define MAX_GAP ((TASK_SIZE)/6*5)
static int mmap_is_legacy(void)
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
static unsigned long mmap_base(unsigned long rnd)
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
@ -158,18 +158,18 @@ unsigned long arch_mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View File

@ -34,8 +34,8 @@ void flush_anon_page(struct vm_area_struct *vma,
void flush_kernel_dcache_page(struct page *page);
void flush_icache_range(unsigned long start, unsigned long end);
void flush_icache_page(struct vm_area_struct *vma, struct page *page);
#define flush_dcache_mmap_lock(mapping) spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages)
#else
#include <asm-generic/cacheflush.h>

View File

@ -46,9 +46,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
extern void flush_dcache_range(unsigned long start, unsigned long end);
extern void invalidate_dcache_range(unsigned long start, unsigned long end);
#define flush_dcache_mmap_lock(mapping) \
spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) \
spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#endif /* _ASM_NIOS2_CACHEFLUSH_H */

View File

@ -55,10 +55,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_lock(mapping) \
spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) \
spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_page(vma,page) do { \
flush_kernel_dcache_page(page); \

View File

@ -26,6 +26,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_SYNC 1 /* synchronous memory sync */
#define MS_ASYNC 2 /* sync memory asynchronously */

View File

@ -70,12 +70,18 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr,
* Top of mmap area (just below the process stack).
*/
static unsigned long mmap_upper_limit(void)
/*
* When called from arch_get_unmapped_area(), rlim_stack will be NULL,
* indicating that "current" should be used instead of a passed-in
* value from the exec bprm as done with arch_pick_mmap_layout().
*/
static unsigned long mmap_upper_limit(struct rlimit *rlim_stack)
{
unsigned long stack_base;
/* Limit stack size - see setup_arg_pages() in fs/exec.c */
stack_base = rlimit_max(RLIMIT_STACK);
stack_base = rlim_stack ? rlim_stack->rlim_max
: rlimit_max(RLIMIT_STACK);
if (stack_base > STACK_SIZE_MAX)
stack_base = STACK_SIZE_MAX;
@ -127,7 +133,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_legacy_base;
info.high_limit = mmap_upper_limit();
info.high_limit = mmap_upper_limit(NULL);
info.align_mask = last_mmap ? (PAGE_MASK & (SHM_COLOUR - 1)) : 0;
info.align_offset = shared_align_offset(last_mmap, pgoff);
addr = vm_unmapped_area(&info);
@ -250,10 +256,10 @@ static unsigned long mmap_legacy_base(void)
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_legacy_base = mmap_legacy_base();
mm->mmap_base = mmap_upper_limit();
mm->mmap_base = mmap_upper_limit(rlim_stack);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;

View File

@ -39,12 +39,12 @@
#define MIN_GAP (128*1024*1024)
#define MAX_GAP (TASK_SIZE/6*5)
static inline int mmap_is_legacy(void)
static inline int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void)
return (1<<30);
}
static inline unsigned long mmap_base(unsigned long rnd)
static inline unsigned long mmap_base(unsigned long rnd,
struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size() + stack_guard_gap;
/* Values close to RLIM_INFINITY can overflow. */
@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
}
static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
unsigned long random_factor)
unsigned long random_factor,
struct rlimit *rlim_stack)
{
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = radix__arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
}
}
#else
/* dummy */
extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
unsigned long random_factor);
unsigned long random_factor,
struct rlimit *rlim_stack);
#endif
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
random_factor = arch_mmap_rnd();
if (radix_enabled())
return radix__arch_pick_mmap_layout(mm, random_factor);
return radix__arch_pick_mmap_layout(mm, random_factor,
rlim_stack);
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View File

@ -75,8 +75,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
/*
* Taken from alloc_migrate_target with changes to remove CMA allocations
*/
struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
int **resultp)
struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
{
gfp_t gfp_mask = GFP_USER;
struct page *new_page;

View File

@ -37,11 +37,11 @@ static unsigned long stack_maxrandom_size(void)
#define MIN_GAP (32*1024*1024)
#define MAX_GAP (STACK_TOP/6*5)
static inline int mmap_is_legacy(void)
static inline int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
@ -56,9 +56,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd)
return TASK_UNMAPPED_BASE + rnd;
}
static inline unsigned long mmap_base(unsigned long rnd)
static inline unsigned long mmap_base(unsigned long rnd,
struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
@ -184,7 +185,7 @@ check_asce_limit:
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@ -195,11 +196,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = mmap_base_legacy(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View File

@ -276,7 +276,7 @@ static unsigned long mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = mmap_rnd();
unsigned long gap;
@ -285,7 +285,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
gap = rlimit(RLIMIT_STACK);
gap = rlim_stack->rlim_cur;
if (!test_thread_flag(TIF_32BIT) ||
(current->personality & ADDR_COMPAT_LAYOUT) ||
gap == RLIM_INFINITY ||

View File

@ -170,10 +170,8 @@ extern void flush_cache_page(struct vm_area_struct *vma,
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
#define flush_dcache_mmap_lock(mapping) \
spin_lock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_unlock(mapping) \
spin_unlock_irq(&(mapping)->tree_lock)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
#define flush_icache_user_range(vma, page, addr, len) \
flush_dcache_page(page)

View File

@ -19,12 +19,6 @@
#include <asm/sizes.h>
#include <mach/memory.h>
/*
* Allow for constants defined here to be used from assembly code
* by prepending the UL suffix only with actual C code compilation.
*/
#define UL(x) _AC(x, UL)
/*
* PAGE_OFFSET - the virtual address of the start of the kernel image
* TASK_SIZE - the maximum size of a user space task.

View File

@ -122,12 +122,14 @@ struct x86_init_pci {
* @guest_late_init: guest late init
* @x2apic_available: X2APIC detection
* @init_mem_mapping: setup early mappings during init_mem_mapping()
* @init_after_bootmem: guest init after boot allocator is finished
*/
struct x86_hyper_init {
void (*init_platform)(void);
void (*guest_late_init)(void);
bool (*x2apic_available)(void);
void (*init_mem_mapping)(void);
void (*init_after_bootmem)(void);
};
/**

View File

@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
.guest_late_init = x86_init_noop,
.x2apic_available = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
.init_after_bootmem = x86_init_noop,
},
.acpi = {

View File

@ -778,6 +778,7 @@ void __init mem_init(void)
free_all_bootmem();
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"

View File

@ -1185,6 +1185,7 @@ void __init mem_init(void)
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
/*
* Must be done after boot memory is put on freelist, because here we

View File

@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void)
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
struct rlimit *rlim_stack)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
unsigned long gap_min, gap_max;
@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd,
* process VM image, sets up which VM layout function to use:
*/
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
unsigned long random_factor, unsigned long task_size)
unsigned long random_factor, unsigned long task_size,
struct rlimit *rlim_stack)
{
*legacy_base = mmap_legacy_base(random_factor, task_size);
if (mmap_is_legacy())
*base = *legacy_base;
else
*base = mmap_base(random_factor, task_size);
*base = mmap_base(random_factor, task_size, rlim_stack);
}
void arch_pick_mmap_layout(struct mm_struct *mm)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
mm->get_unmapped_area = arch_get_unmapped_area;
@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
rlim_stack);
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/*
@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* mmap_base, the compat syscall uses mmap_compat_base.
*/
arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
arch_rnd(mmap32_rnd_bits), task_size_32bit());
arch_rnd(mmap32_rnd_bits), task_size_32bit(),
rlim_stack);
#endif
}

View File

@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr)
}
/*
* During early boot all page table pages are pinned, but we do not have struct
* pages, so return true until struct pages are ready.
*/
static bool xen_page_pinned(void *ptr)
{
struct page *page = virt_to_page(ptr);
if (static_branch_likely(&xen_struct_pages_ready)) {
struct page *page = virt_to_page(ptr);
return PagePinned(page);
return PagePinned(page);
}
return true;
}
static void xen_extend_mmu_update(const struct mmu_update *update)
@ -836,11 +845,6 @@ void xen_mm_pin_all(void)
spin_unlock(&pgd_lock);
}
/*
* The init_mm pagetable is really pinned as soon as its created, but
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
return 0;
}
static void __init xen_mark_init_mm_pinned(void)
/*
* The init_mm pagetable is really pinned as soon as it's created, but
* that's before we have page structures to store the bits. So do all
* the book-keeping now once struct pages for allocated pages are
* initialized. This happens only after free_all_bootmem() is called.
*/
static void __init xen_after_bootmem(void)
{
static_branch_enable(&xen_struct_pages_ready);
#ifdef CONFIG_X86_64
SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
unsigned level)
{
bool pinned = PagePinned(virt_to_page(mm->pgd));
bool pinned = xen_page_pinned(mm->pgd);
trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
if (pinned) {
struct page *page = pfn_to_page(pfn);
SetPagePinned(page);
if (static_branch_likely(&xen_struct_pages_ready))
SetPagePinned(page);
if (!PageHighMem(page)) {
xen_mc_batch();
@ -2364,9 +2379,7 @@ static void __init xen_post_allocator_init(void)
#ifdef CONFIG_X86_64
pv_mmu_ops.write_cr3 = &xen_write_cr3;
SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
xen_mark_init_mm_pinned();
}
static void xen_leave_lazy_mmu(void)
@ -2450,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
x86_init.hyper.init_after_bootmem = xen_after_bootmem;
pv_mmu_ops = xen_mmu_ops;

View File

@ -57,6 +57,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */

View File

@ -837,11 +837,8 @@ int __init memory_dev_init(void)
* during boot and have been initialized
*/
mutex_lock(&mem_sysfs_mutex);
for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) {
/* Don't iterate over sections we know are !present: */
if (i > __highest_present_section_nr)
break;
for (i = 0; i <= __highest_present_section_nr;
i += sections_per_block) {
err = add_memory_block(i);
if (!ret)
ret = err;

View File

@ -253,7 +253,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c)
static unsigned int hwsim_net_id;
static struct ida hwsim_netgroup_ida = IDA_INIT;
static DEFINE_IDA(hwsim_netgroup_ida);
struct hwsim_net {
int netgroup;

View File

@ -295,7 +295,7 @@ static void __init of_unittest_printf(void)
return;
}
num_to_str(phandle_str, sizeof(phandle_str), np->phandle);
num_to_str(phandle_str, sizeof(phandle_str), np->phandle, 0);
of_unittest_printf_one(np, "%pOF", full_name);
of_unittest_printf_one(np, "%pOFf", full_name);

View File

@ -212,7 +212,6 @@ struct mport_cdev_priv {
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
struct dma_chan *dmach;
struct list_head async_list;
struct list_head pend_list;
spinlock_t req_lock;
struct mutex dma_lock;
struct kref dma_ref;
@ -258,8 +257,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait);
static struct class *dev_class;
static dev_t dev_number;
static struct workqueue_struct *dma_wq;
static void mport_release_mapping(struct kref *ref);
static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg,
@ -539,6 +536,7 @@ static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg)
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
struct mport_dma_req {
struct kref refcount;
struct list_head node;
struct file *filp;
struct mport_cdev_priv *priv;
@ -554,11 +552,6 @@ struct mport_dma_req {
struct completion req_comp;
};
struct mport_faf_work {
struct work_struct work;
struct mport_dma_req *req;
};
static void mport_release_def_dma(struct kref *dma_ref)
{
struct mport_dev *md =
@ -578,8 +571,10 @@ static void mport_release_dma(struct kref *dma_ref)
complete(&priv->comp);
}
static void dma_req_free(struct mport_dma_req *req)
static void dma_req_free(struct kref *ref)
{
struct mport_dma_req *req = container_of(ref, struct mport_dma_req,
refcount);
struct mport_cdev_priv *priv = req->priv;
unsigned int i;
@ -611,30 +606,7 @@ static void dma_xfer_callback(void *param)
req->status = dma_async_is_tx_complete(priv->dmach, req->cookie,
NULL, NULL);
complete(&req->req_comp);
}
static void dma_faf_cleanup(struct work_struct *_work)
{
struct mport_faf_work *work = container_of(_work,
struct mport_faf_work, work);
struct mport_dma_req *req = work->req;
dma_req_free(req);
kfree(work);
}
static void dma_faf_callback(void *param)
{
struct mport_dma_req *req = (struct mport_dma_req *)param;
struct mport_faf_work *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
return;
INIT_WORK(&work->work, dma_faf_cleanup);
work->req = req;
queue_work(dma_wq, &work->work);
kref_put(&req->refcount, dma_req_free);
}
/*
@ -765,16 +737,14 @@ static int do_dma_request(struct mport_dma_req *req,
goto err_out;
}
if (sync == RIO_TRANSFER_FAF)
tx->callback = dma_faf_callback;
else
tx->callback = dma_xfer_callback;
tx->callback = dma_xfer_callback;
tx->callback_param = req;
req->dmach = chan;
req->sync = sync;
req->status = DMA_IN_PROGRESS;
init_completion(&req->req_comp);
kref_get(&req->refcount);
cookie = dmaengine_submit(tx);
req->cookie = cookie;
@ -785,6 +755,7 @@ static int do_dma_request(struct mport_dma_req *req,
if (dma_submit_error(cookie)) {
rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)",
cookie, xfer->rio_addr, xfer->length);
kref_put(&req->refcount, dma_req_free);
ret = -EIO;
goto err_out;
}
@ -860,6 +831,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
if (!req)
return -ENOMEM;
kref_init(&req->refcount);
ret = get_dma_channel(priv);
if (ret) {
kfree(req);
@ -968,42 +941,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
ret = do_dma_request(req, xfer, sync, nents);
if (ret >= 0) {
if (sync == RIO_TRANSFER_SYNC)
goto sync_out;
return ret; /* return ASYNC cookie */
if (sync == RIO_TRANSFER_ASYNC)
return ret; /* return ASYNC cookie */
} else {
rmcd_debug(DMA, "do_dma_request failed with err=%d", ret);
}
if (ret == -ETIMEDOUT || ret == -EINTR) {
/*
* This can happen only in case of SYNC transfer.
* Do not free unfinished request structure immediately.
* Place it into pending list and deal with it later
*/
spin_lock(&priv->req_lock);
list_add_tail(&req->node, &priv->pend_list);
spin_unlock(&priv->req_lock);
return ret;
}
rmcd_debug(DMA, "do_dma_request failed with err=%d", ret);
sync_out:
dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir);
sg_free_table(&req->sgt);
err_pg:
if (page_list) {
if (!req->page_list) {
for (i = 0; i < nr_pages; i++)
put_page(page_list[i]);
kfree(page_list);
}
err_req:
if (req->map) {
mutex_lock(&md->buf_mutex);
kref_put(&req->map->ref, mport_release_mapping);
mutex_unlock(&md->buf_mutex);
}
put_dma_channel(priv);
kfree(req);
kref_put(&req->refcount, dma_req_free);
return ret;
}
@ -1121,7 +1072,7 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
ret = 0;
if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED)
dma_req_free(req);
kref_put(&req->refcount, dma_req_free);
return ret;
@ -1966,7 +1917,6 @@ static int mport_cdev_open(struct inode *inode, struct file *filp)
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
INIT_LIST_HEAD(&priv->async_list);
INIT_LIST_HEAD(&priv->pend_list);
spin_lock_init(&priv->req_lock);
mutex_init(&priv->dma_lock);
#endif
@ -2006,8 +1956,6 @@ static void mport_cdev_release_dma(struct file *filp)
md = priv->md;
flush_workqueue(dma_wq);
spin_lock(&priv->req_lock);
if (!list_empty(&priv->async_list)) {
rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)",
@ -2023,20 +1971,7 @@ static void mport_cdev_release_dma(struct file *filp)
req->filp, req->cookie,
completion_done(&req->req_comp)?"yes":"no");
list_del(&req->node);
dma_req_free(req);
}
}
if (!list_empty(&priv->pend_list)) {
rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)",
filp, current->comm, task_pid_nr(current));
list_for_each_entry_safe(req,
req_next, &priv->pend_list, node) {
rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s",
req->filp, req->cookie,
completion_done(&req->req_comp)?"yes":"no");
list_del(&req->node);
dma_req_free(req);
kref_put(&req->refcount, dma_req_free);
}
}
@ -2048,15 +1983,6 @@ static void mport_cdev_release_dma(struct file *filp)
current->comm, task_pid_nr(current), wret);
}
spin_lock(&priv->req_lock);
if (!list_empty(&priv->pend_list)) {
rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)",
filp, current->comm, task_pid_nr(current));
}
spin_unlock(&priv->req_lock);
if (priv->dmach != priv->md->dma_chan) {
rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)",
filp, current->comm, task_pid_nr(current));
@ -2573,8 +2499,6 @@ static void mport_cdev_remove(struct mport_dev *md)
cdev_device_del(&md->cdev, &md->dev);
mport_cdev_kill_fasync(md);
flush_workqueue(dma_wq);
/* TODO: do we need to give clients some time to close file
* descriptors? Simple wait for XX, or kref?
*/
@ -2691,17 +2615,8 @@ static int __init mport_init(void)
goto err_cli;
}
dma_wq = create_singlethread_workqueue("dma_wq");
if (!dma_wq) {
rmcd_error("failed to create DMA work queue");
ret = -ENOMEM;
goto err_wq;
}
return 0;
err_wq:
class_interface_unregister(&rio_mport_interface);
err_cli:
unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
err_chr:
@ -2717,7 +2632,6 @@ static void __exit mport_exit(void)
class_interface_unregister(&rio_mport_interface);
class_destroy(dev_class);
unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
destroy_workqueue(dma_wq);
}
module_init(mport_init);

View File

@ -76,7 +76,7 @@ static u16 rio_destid_alloc(struct rio_net *net)
}
/**
* rio_destid_reserve - Reserve the specivied destID
* rio_destid_reserve - Reserve the specified destID
* @net: RIO network
* @destid: destID to reserve
*
@ -885,7 +885,7 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport,
*
* For each enumerated device, ensure that each switch in a system
* has correct routing entries. Add routes for devices that were
* unknown dirung the first enumeration pass through the switch.
* unknown during the first enumeration pass through the switch.
*/
static void rio_update_route_tables(struct rio_net *net)
{
@ -983,7 +983,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags)
/* reserve mport destID in new net */
rio_destid_reserve(net, mport->host_deviceid);
/* Enable Input Output Port (transmitter reviever) */
/* Enable Input Output Port (transmitter receiver) */
rio_enable_rx_tx_port(mport, 1, 0, 0, 0);
/* Set component tag for host */

View File

@ -69,7 +69,7 @@ blkcnt_t dirty_cnt(struct inode *inode)
void *results[1];
if (inode->i_mapping)
cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages,
results, 0, 1,
PAGECACHE_TAG_DIRTY);
if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0)

View File

@ -934,14 +934,14 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash,
struct page *page;
int found;
spin_lock_irq(&mapping->tree_lock);
found = radix_tree_gang_lookup(&mapping->page_tree,
xa_lock_irq(&mapping->i_pages);
found = radix_tree_gang_lookup(&mapping->i_pages,
(void **)&page, offset, 1);
if (found > 0 && !radix_tree_exceptional_entry(page)) {
struct lu_dirpage *dp;
get_page(page);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
/*
* In contrast to find_lock_page() we are sure that directory
* page cannot be truncated (while DLM lock is held) and,
@ -989,7 +989,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash,
page = ERR_PTR(-EIO);
}
} else {
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
page = NULL;
}
return page;

View File

@ -570,10 +570,11 @@ static int afs_writepages_region(struct address_space *mapping,
_debug("wback %lx", page->index);
/* at this point we hold neither mapping->tree_lock nor lock on
* the page itself: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled back from
* swapper_space to tmpfs file mapping
/*
* at this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled
* back from swapper_space to tmpfs file mapping
*/
ret = lock_page_killable(page);
if (ret < 0) {

View File

@ -19,9 +19,6 @@
*/
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
{
struct autofs_wait_queue *wq, *nwq;
@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
* wq->name.name is NULL iff the lock is already released
* or the mount has been made catatonic.
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
unsigned long shutdown_sigs_mask;
unsigned long irqflags;
sigset_t oldset;
spin_lock_irqsave(&current->sighand->siglock, irqflags);
oldset = current->blocked;
shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
siginitsetinv(&current->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
wait_event_interruptible(wq->queue, wq->name.name == NULL);
spin_lock_irqsave(&current->sighand->siglock, irqflags);
current->blocked = oldset;
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
} else {
pr_debug("skipped sleeping\n");
}
wait_event_killable(wq->queue, wq->name.name == NULL);
status = wq->status;
/*
@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
kfree(wq->name.name);
wq->name.name = NULL; /* Do not wait on this queue */
wq->status = status;
wake_up_interruptible(&wq->queue);
wake_up(&wq->queue);
if (!--wq->wait_ctr)
kfree(wq);
mutex_unlock(&sbi->wq_mutex);

View File

@ -330,6 +330,7 @@ beyond_if:
#ifdef __alpha__
regs->gp = ex.a_gpvalue;
#endif
finalize_exec(bprm);
start_thread(regs, ex.a_entry, current->mm->start_stack);
return 0;
}

View File

@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
} else
map_addr = vm_mmap(filep, addr, size, prot, type, off);
if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr))
pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n",
task_pid_nr(current), current->comm,
(void *)addr);
return(map_addr);
}
@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
elf_prot |= PROT_EXEC;
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
elf_type |= MAP_FIXED;
elf_type |= MAP_FIXED_NOREPLACE;
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
int elf_prot = 0, elf_flags;
int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
unsigned long k, vaddr;
unsigned long total_size = 0;
@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
}
}
/*
* Some binaries have overlapping elf segments and then
* we have to forcefully map over an existing mapping
* e.g. over this newly established brk mapping.
*/
elf_fixed = MAP_FIXED;
}
if (elf_ppnt->p_flags & PF_R)
@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
* the ET_DYN load_addr calculations, proceed normally.
*/
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
elf_flags |= MAP_FIXED;
elf_flags |= elf_fixed;
} else if (loc->elf_ex.e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
elf_flags |= MAP_FIXED;
elf_flags |= elf_fixed;
} else
load_bias = 0;
@ -1155,6 +1167,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
ELF_PLAT_INIT(regs, reloc_func_desc);
#endif
finalize_exec(bprm);
start_thread(regs, elf_entry, bprm->p);
retval = 0;
out:
@ -1234,7 +1247,7 @@ static int load_elf_library(struct file *file)
(eppnt->p_filesz +
ELF_PAGEOFFSET(eppnt->p_vaddr)),
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE,
(eppnt->p_offset -
ELF_PAGEOFFSET(eppnt->p_vaddr)));
if (error != ELF_PAGESTART(eppnt->p_vaddr))

View File

@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
dynaddr);
#endif
finalize_exec(bprm);
/* everything is now ready... get the userspace context ready to roll */
entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
start_thread(regs, entryaddr, current->mm->start_stack);

View File

@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
FLAT_PLAT_INIT(regs);
#endif
finalize_exec(bprm);
pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n",
regs, start_addr, current->mm->start_stack);
start_thread(regs, start_addr, current->mm->start_stack);

View File

@ -458,7 +458,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
page = radix_tree_lookup(&mapping->i_pages, pg_index);
rcu_read_unlock();
if (page && !radix_tree_exceptional_entry(page)) {
misses++;

View File

@ -3963,11 +3963,11 @@ retry:
done_index = page->index;
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file
* mapping
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
* invalidated (changing page->mapping to NULL),
* or even swizzled back from swapper_space to
* tmpfs file mapping
*/
if (!trylock_page(page)) {
flush_write_bio(epd);
@ -5174,13 +5174,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!PagePrivate(page));
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page)) {
radix_tree_tag_clear(&page->mapping->page_tree,
radix_tree_tag_clear(&page->mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
}

View File

@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* private_lock.
*
* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
* Hack idea: for the blockdev mapping, private_lock contention
* may be quite high. This code could TryLock the page, and if that
* succeeds, there is no need to take private_lock. (But if
* private_lock is contended then so is mapping->tree_lock).
* succeeds, there is no need to take private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
@ -594,20 +593,21 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
void __set_page_dirty(struct page *page, struct address_space *mapping,
int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
xa_lock_irqsave(&mapping->i_pages, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
EXPORT_SYMBOL_GPL(__set_page_dirty);
/*
* Add a page to the dirty page list.
@ -1095,7 +1095,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
* mapping->tree_lock and mapping->host->i_lock.
* i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
{

View File

@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
for (i = 0; i < found_pages; i++) {
page = wdata->pages[i];
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file
* mapping
* At this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled
* back from swapper_space to tmpfs file mapping
*/
if (nr_pages == 0)

124
fs/dax.c
View File

@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
}
/*
* We do not necessarily hold the mapping->tree_lock when we call this
* function so it is possible that 'entry' is no longer a valid item in the
* radix tree. This is okay because all we really need to do is to find the
* correct waitqueue where tasks might be waiting for that old 'entry' and
* wake them.
* @entry may no longer be the entry at the index in the mapping.
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, void *entry, bool wake_all)
@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
* under mapping->tree_lock, ditto for entry handling in our callers.
* under the i_pages lock, ditto for entry handling in our callers.
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
* Check whether the given slot is locked. The function must be called with
* mapping->tree_lock held
* Check whether the given slot is locked. Must be called with the i_pages
* lock held.
*/
static inline int slot_locked(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
return entry & RADIX_DAX_ENTRY_LOCK;
}
/*
* Mark the given slot is locked. The function must be called with
* mapping->tree_lock held
* Mark the given slot as locked. Must be called with the i_pages lock held.
*/
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
/*
* Mark the given slot is unlocked. The function must be called with
* mapping->tree_lock held
* Mark the given slot as unlocked. Must be called with the i_pages lock held.
*/
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
* put_locked_mapping_entry() when he locked the entry and now wants to
* unlock it.
*
* The function must be called with mapping->tree_lock held.
* Must be called with the i_pages lock held.
*/
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
&slot);
if (!entry ||
WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
schedule();
finish_wait(wq, &ewait.wait);
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
}
}
@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
{
void *entry, **slot;
spin_lock_irq(&mapping->tree_lock);
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
xa_lock_irq(&mapping->i_pages);
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot))) {
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
return;
}
unlock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
void *entry, **slot;
restart:
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
@ -420,12 +416,12 @@ restart:
if (pmd_downgrade) {
/*
* Make sure 'entry' remains valid while we drop
* mapping->tree_lock.
* the i_pages lock.
*/
entry = lock_slot(mapping, slot);
}
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
@ -442,27 +438,27 @@ restart:
put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
if (!entry) {
/*
* We needed to drop the page_tree lock while calling
* We needed to drop the i_pages lock while calling
* radix_tree_preload() and we didn't have an entry to
* lock. See if another thread inserted an entry at
* our index during this time.
*/
entry = __radix_tree_lookup(&mapping->page_tree, index,
entry = __radix_tree_lookup(&mapping->i_pages, index,
NULL, &slot);
if (entry) {
radix_tree_preload_end();
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
goto restart;
}
}
if (pmd_downgrade) {
dax_disassociate_entry(entry, mapping, false);
radix_tree_delete(&mapping->page_tree, index);
radix_tree_delete(&mapping->i_pages, index);
mapping->nrexceptional--;
dax_wake_mapping_entry_waiter(mapping, index, entry,
true);
@ -470,11 +466,11 @@ restart:
entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
err = __radix_tree_insert(&mapping->page_tree, index,
err = __radix_tree_insert(&mapping->i_pages, index,
dax_radix_order(entry), entry);
radix_tree_preload_end();
if (err) {
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
/*
* Our insertion of a DAX entry failed, most likely
* because we were inserting a PMD entry and it
@ -487,12 +483,12 @@ restart:
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
return entry;
}
entry = lock_slot(mapping, slot);
out_unlock:
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
return entry;
}
@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
{
int ret = 0;
void *entry;
struct radix_tree_root *page_tree = &mapping->page_tree;
struct radix_tree_root *pages = &mapping->i_pages;
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
(radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
radix_tree_delete(page_tree, index);
radix_tree_delete(pages, index);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(pages);
return ret;
}
/*
@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
void *entry, pfn_t pfn_t,
unsigned long flags, bool dirty)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
struct radix_tree_root *pages = &mapping->i_pages;
unsigned long pfn = pfn_t_to_pfn(pfn_t);
pgoff_t index = vmf->pgoff;
void *new_entry;
@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
}
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(pages);
new_entry = dax_radix_locked_entry(pfn, flags);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
@ -624,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
void **slot;
void *ret;
ret = __radix_tree_lookup(page_tree, index, &node, &slot);
ret = __radix_tree_lookup(pages, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
__radix_tree_replace(pages, node, slot,
new_entry, NULL);
entry = new_entry;
}
if (dirty)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(pages);
return entry;
}
@ -723,7 +719,7 @@ unlock_pte:
static int dax_writeback_one(struct dax_device *dax_dev,
struct address_space *mapping, pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
struct radix_tree_root *pages = &mapping->i_pages;
void *entry2, **slot;
unsigned long pfn;
long ret = 0;
@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev,
if (WARN_ON(!radix_tree_exceptional_entry(entry)))
return -EIO;
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(pages);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev,
}
/* Another fsync thread may have already written back this entry */
if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
goto put_unlocked;
/* Lock the entry to serialize with page faults */
entry = lock_slot(mapping, slot);
@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
* actually flush the caches. This is achieved as the calls will look
* at the entry only under tree_lock and once they do that they will
* see the entry locked and wait for it to unlock.
* at the entry only under the i_pages lock and once they do that
* they will see the entry locked and wait for it to unlock.
*/
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
spin_unlock_irq(&mapping->tree_lock);
radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
xa_unlock_irq(pages);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
xa_lock_irq(pages);
radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(pages);
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(pages);
return ret;
}
@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
pgoff_t index = vmf->pgoff;
int vmf_ret, error;
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
/* Did we race with someone splitting entry or so? */
if (!entry ||
(pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
(pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
switch (pe_size) {
case PE_SIZE_PTE:
error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);

View File

@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
static void __d_free_external_name(struct rcu_head *head)
{
struct external_name *name = container_of(head, struct external_name,
u.head);
mod_node_page_state(page_pgdat(virt_to_page(name)),
NR_INDIRECTLY_RECLAIMABLE_BYTES,
-ksize(name));
kfree(name);
}
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
__d_free_external_name(&external_name(dentry)->u.head);
kmem_cache_free(dentry_cache, dentry);
}
static inline int dname_external(const struct dentry *dentry)
@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
struct external_name *p;
p = container_of(name->name, struct external_name, name[0]);
if (unlikely(atomic_dec_and_test(&p->u.count)))
kfree_rcu(p, u.head);
call_rcu(&p->u.head, __d_free_external_name);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@ -1038,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list)
while (!list_empty(list)) {
struct dentry *dentry, *parent;
cond_resched();
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
@ -1191,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb)
this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
cond_resched();
} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@ -1473,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent)
break;
shrink_dentry_list(&data.dispose);
cond_resched();
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
@ -1600,7 +1614,6 @@ void d_invalidate(struct dentry *dentry)
detach_mounts(data.mountpoint);
dput(data.mountpoint);
}
cond_resched();
}
}
EXPORT_SYMBOL(d_invalidate);
@ -1617,6 +1630,7 @@ EXPORT_SYMBOL(d_invalidate);
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct external_name *ext = NULL;
struct dentry *dentry;
char *dname;
int err;
@ -1637,14 +1651,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
struct external_name *p = kmalloc(size + name->len,
GFP_KERNEL_ACCOUNT);
if (!p) {
ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
if (!ext) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
atomic_set(&p->u.count, 1);
dname = p->name;
atomic_set(&ext->u.count, 1);
dname = ext->name;
} else {
dname = dentry->d_iname;
}
@ -1683,6 +1697,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
}
if (unlikely(ext)) {
pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
ksize(ext));
}
this_cpu_inc(nr_dentry);
return dentry;
@ -2770,7 +2790,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
kfree_rcu(old_name, u.head);
call_rcu(&old_name->u.head, __d_free_external_name);
}
/*

View File

@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* to work from.
*/
limit = _STK_LIM / 4 * 3;
limit = min(limit, rlimit(RLIMIT_STACK) / 4);
limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
if (size > limit)
goto fail;
}
@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm)
if (!mm)
goto err;
/* Save current stack limit for all calculations made during exec. */
task_lock(current->group_leader);
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
err = __bprm_mm_init(bprm);
if (err)
goto err;
@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
stack_base = rlimit_max(RLIMIT_STACK);
stack_base = bprm->rlim_stack.rlim_max;
if (stack_base > STACK_SIZE_MAX)
stack_base = STACK_SIZE_MAX;
@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
* Align this down to a page boundary as expand_stack
* will align it up.
*/
rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
#ifdef CONFIG_STACK_GROWSUP
if (stack_size + stack_expand > rlim_stack)
stack_base = vma->vm_start + rlim_stack;
@ -1341,11 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm)
* RLIMIT_STACK, but after the point of no return to avoid
* needing to clean up the change on failure.
*/
if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM)
current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM;
if (bprm->rlim_stack.rlim_cur > _STK_LIM)
bprm->rlim_stack.rlim_cur = _STK_LIM;
}
arch_pick_mmap_layout(current->mm);
arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
current->sas_ss_sp = current->sas_ss_size = 0;
@ -1378,6 +1383,16 @@ void setup_new_exec(struct linux_binprm * bprm)
}
EXPORT_SYMBOL(setup_new_exec);
/* Runs immediately before start_thread() takes over. */
void finalize_exec(struct linux_binprm *bprm)
{
/* Store any stack rlimit changes before starting thread. */
task_lock(current->group_leader);
current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
task_unlock(current->group_leader);
}
EXPORT_SYMBOL(finalize_exec);
/*
* Prepare credentials and lock ->cred_guard_mutex.
* install_exec_creds() commits the new creds and drops the lock.

View File

@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page)
SetPageDirty(page);
spin_unlock(&mapping->private_lock);
spin_lock_irqsave(&mapping->tree_lock, flags);
xa_lock_irqsave(&mapping->i_pages, flags);
WARN_ON_ONCE(!PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

View File

@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
spin_lock_irqsave(&mapping->tree_lock, flags);
radix_tree_tag_clear(&mapping->page_tree, page_index(page),
xa_lock_irqsave(&mapping->i_pages, flags);
radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);

View File

@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(GFP_NOFS),
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
trace_f2fs_gc_begin(sbi->sb, sync, background,

View File

@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
spin_lock_irqsave(&mapping->tree_lock, flags);
radix_tree_tag_clear(&mapping->page_tree, page_index(page),
xa_lock_irqsave(&mapping->i_pages, flags);
radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
xa_unlock_irqrestore(&mapping->i_pages, flags);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);

View File

@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page)
unsigned int long flags;
if (PageDirty(page)) {
spin_lock_irqsave(&mapping->tree_lock, flags);
radix_tree_tag_clear(&mapping->page_tree,
xa_lock_irqsave(&mapping->i_pages, flags);
radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
rcu_read_lock();
apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
rcu_read_unlock();
if (apage)
return;

View File

@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
* synchronizing against mapping->tree_lock.
* synchronizing against the i_pages lock.
*
* Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
* Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
spin_lock(&inode->i_lock);
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under underwriteback.
* pages actually under writeback.
*/
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_DIRTY) {
struct page *page = radix_tree_deref_slot_protected(slot,
&mapping->tree_lock);
&mapping->i_pages.xa_lock);
if (likely(page) && PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_WRITEBACK) {
struct page *page = radix_tree_deref_slot_protected(slot,
&mapping->tree_lock);
&mapping->i_pages.xa_lock);
if (likely(page)) {
WARN_ON_ONCE(!PageWriteback(page));
dec_wb_stat(old_wb, WB_WRITEBACK);
@ -430,7 +430,7 @@ skip_switch:
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
* tree_lock so that stat transfer can synchronize against them.
* the RCU protected stat update paths to grab the i_pages
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);

View File

@ -832,7 +832,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
/* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
BUG_ON(cookie->stores.rnode);
BUG_ON(!radix_tree_empty(&cookie->stores));
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);

View File

@ -973,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
* retire the object instead.
*/
if (!fscache_use_cookie(object)) {
ASSERT(object->cookie->stores.rnode == NULL);
ASSERT(radix_tree_empty(&object->cookie->stores));
set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
_leave(" [no cookie]");
return transit_to(KILL_OBJECT);

View File

@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&mapping->tree_lock);
INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash);
void clear_inode(struct inode *inode)
{
/*
* We have to cycle tree_lock here because reclaim can be still in the
* We have to cycle the i_pages lock here because reclaim can be in the
* process of removing the last page (in __delete_from_page_cache())
* and we must not free mapping under it.
* and we must not free the mapping under it.
*/
spin_lock_irq(&inode->i_data.tree_lock);
xa_lock_irq(&inode->i_data.i_pages);
BUG_ON(inode->i_data.nrpages);
BUG_ON(inode->i_data.nrexceptional);
spin_unlock_irq(&inode->i_data.tree_lock);
xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);

View File

@ -193,9 +193,9 @@ retry:
(unsigned long long)oldkey,
(unsigned long long)newkey);
spin_lock_irq(&btnc->tree_lock);
err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
spin_unlock_irq(&btnc->tree_lock);
xa_lock_irq(&btnc->i_pages);
err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
* nilfs_btnode_commit_change_key() will be called.
@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
(unsigned long long)newkey);
mark_buffer_dirty(obh);
spin_lock_irq(&btnc->tree_lock);
radix_tree_delete(&btnc->page_tree, oldkey);
radix_tree_tag_set(&btnc->page_tree, newkey,
xa_lock_irq(&btnc->i_pages);
radix_tree_delete(&btnc->i_pages, oldkey);
radix_tree_tag_set(&btnc->i_pages, newkey,
PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&btnc->tree_lock);
xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
unlock_page(opage);
@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
return;
if (nbh == NULL) { /* blocksize == pagesize */
spin_lock_irq(&btnc->tree_lock);
radix_tree_delete(&btnc->page_tree, newkey);
spin_unlock_irq(&btnc->tree_lock);
xa_lock_irq(&btnc->i_pages);
radix_tree_delete(&btnc->i_pages, newkey);
xa_unlock_irq(&btnc->i_pages);
unlock_page(ctxt->bh->b_page);
} else
brelse(nbh);

View File

@ -331,15 +331,15 @@ repeat:
struct page *page2;
/* move the page to the destination cache */
spin_lock_irq(&smap->tree_lock);
page2 = radix_tree_delete(&smap->page_tree, offset);
xa_lock_irq(&smap->i_pages);
page2 = radix_tree_delete(&smap->i_pages, offset);
WARN_ON(page2 != page);
smap->nrpages--;
spin_unlock_irq(&smap->tree_lock);
xa_unlock_irq(&smap->i_pages);
spin_lock_irq(&dmap->tree_lock);
err = radix_tree_insert(&dmap->page_tree, offset, page);
xa_lock_irq(&dmap->i_pages);
err = radix_tree_insert(&dmap->i_pages, offset, page);
if (unlikely(err < 0)) {
WARN_ON(err == -EEXIST);
page->mapping = NULL;
@ -348,11 +348,11 @@ repeat:
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
radix_tree_tag_set(&dmap->page_tree,
radix_tree_tag_set(&dmap->i_pages,
offset,
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&dmap->tree_lock);
xa_unlock_irq(&dmap->i_pages);
}
unlock_page(page);
}
@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
if (mapping) {
spin_lock_irq(&mapping->tree_lock);
xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
radix_tree_tag_clear(&mapping->page_tree,
radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
}
spin_unlock_irq(&mapping->tree_lock);
xa_unlock_irq(&mapping->i_pages);
return 0;
}
return TestClearPageDirty(page);

View File

@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk)
return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)
{
struct fs_struct *fs;
int umask = -ENOENT;
task_lock(tsk);
fs = tsk->fs;
if (fs)
umask = fs->umask;
task_unlock(tsk);
return umask;
}
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
int g, umask;
int g, umask = -1;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
@ -177,17 +164,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
umask = get_task_umask(p);
if (umask >= 0)
seq_printf(m, "Umask:\t%#04o\n", umask);
task_lock(p);
if (p->fs)
umask = p->fs->umask;
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
task_unlock(p);
rcu_read_unlock();
seq_printf(m, "State:\t%s", get_task_state(p));
if (umask >= 0)
seq_printf(m, "Umask:\t%#04o\n", umask);
seq_puts(m, "State:\t");
seq_puts(m, get_task_state(p));
seq_put_decimal_ull(m, "\nTgid:\t", tgid);
seq_put_decimal_ull(m, "\nNgid:\t", ngid);
@ -313,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
seq_puts(m, header);
CAP_FOR_EACH_U32(__capi) {
seq_printf(m, "%08x",
a->cap[CAP_LAST_U32 - __capi]);
seq_put_hex_ll(m, NULL,
a->cap[CAP_LAST_U32 - __capi], 8);
}
seq_putc(m, '\n');
}
@ -368,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
{
seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state);
seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
seq_putc(m, '\n');
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@ -504,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(task->real_start_time);
seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
seq_puts(m, tcomm);
seq_puts(m, ") ");
seq_putc(m, state);
seq_put_decimal_ll(m, " ", ppid);
seq_put_decimal_ll(m, " ", pgid);
seq_put_decimal_ll(m, " ", sid);

View File

@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
unsigned long wchan;
char symname[KSYM_NAME_LEN];
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto print0;
wchan = get_wchan(task);
if (wchan && !lookup_symbol_name(wchan, symname)) {
seq_puts(m, symname);
return 0;
}
if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
&& !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
seq_putc(m, '0');
print0:
seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
@ -1910,6 +1913,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
unsigned long long sval, eval;
unsigned int len;
if (str[0] == '0' && str[1] != '-')
return -EINVAL;
len = _parse_integer(str, 16, &sval);
if (len & KSTRTOX_OVERFLOW)
return -EINVAL;
@ -1921,6 +1926,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
return -EINVAL;
str++;
if (str[0] == '0' && str[1])
return -EINVAL;
len = _parse_integer(str, 16, &eval);
if (len & KSTRTOX_OVERFLOW)
return -EINVAL;
@ -2204,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
}
}
up_read(&mm->mmap_sem);
mmput(mm);
for (i = 0; i < nr_files; i++) {
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
@ -2221,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
}
if (fa)
flex_array_free(fa);
mmput(mm);
out_put_task:
put_task_struct(task);

View File

@ -6,7 +6,8 @@
static int cmdline_proc_show(struct seq_file *m, void *v)
{
seq_printf(m, "%s\n", saved_command_line);
seq_puts(m, saved_command_line);
seq_putc(m, '\n');
return 0;
}

View File

@ -8,6 +8,7 @@
* Copyright (C) 1997 Theodore Ts'o
*/
#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@ -28,6 +29,17 @@
static DEFINE_RWLOCK(proc_subdir_lock);
struct kmem_cache *proc_dir_entry_cache __ro_after_init;
/*
 * Release a proc_dir_entry together with the memory hanging off it.
 *
 * ->data is kfree()d only for symlink entries, ->name is kfree()d only when
 * it does not point at the entry's own inline_name buffer, and the entry
 * itself goes back to proc_dir_entry_cache.
 */
void pde_free(struct proc_dir_entry *pde)
{
	const bool is_symlink = S_ISLNK(pde->mode);
	const bool name_is_external = pde->name != pde->inline_name;

	if (is_symlink)
		kfree(pde->data);

	/* A name stored in inline_name is part of the entry; nothing to free. */
	if (name_is_external)
		kfree(pde->name);

	kmem_cache_free(proc_dir_entry_cache, pde);
}
static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
{
if (len < de->namelen)
@ -40,8 +52,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
return rb_entry_safe(rb_first_cached(&dir->subdir),
struct proc_dir_entry, subdir_node);
return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
subdir_node);
}
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
@ -54,7 +66,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
const char *name,
unsigned int len)
{
struct rb_node *node = dir->subdir.rb_root.rb_node;
struct rb_node *node = dir->subdir.rb_node;
while (node) {
struct proc_dir_entry *de = rb_entry(node,
@ -75,9 +87,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
static bool pde_subdir_insert(struct proc_dir_entry *dir,
struct proc_dir_entry *de)
{
struct rb_root_cached *root = &dir->subdir;
struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
bool leftmost = true;
struct rb_root *root = &dir->subdir;
struct rb_node **new = &root->rb_node, *parent = NULL;
/* Figure out where to put new node */
while (*new) {
@ -89,16 +100,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
parent = *new;
if (result < 0)
new = &(*new)->rb_left;
else if (result > 0) {
else if (result > 0)
new = &(*new)->rb_right;
leftmost = false;
} else
else
return false;
}
/* Add new node and rebalance tree. */
rb_link_node(&de->subdir_node, parent, new);
rb_insert_color_cached(&de->subdir_node, root, leftmost);
rb_insert_color(&de->subdir_node, root);
return true;
}
@ -354,6 +364,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
WARN(1, "name len %u\n", qstr.len);
return NULL;
}
if (qstr.len == 1 && fn[0] == '.') {
WARN(1, "name '.'\n");
return NULL;
}
if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
WARN(1, "name '..'\n");
return NULL;
}
if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
WARN(1, "create '/proc/%s' by hand\n", qstr.name);
return NULL;
@ -363,16 +381,26 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
return NULL;
}
ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!ent)
goto out;
if (qstr.len + 1 <= sizeof(ent->inline_name)) {
ent->name = ent->inline_name;
} else {
ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
if (!ent->name) {
pde_free(ent);
return NULL;
}
}
memcpy(ent->name, fn, qstr.len + 1);
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
ent->subdir = RB_ROOT_CACHED;
atomic_set(&ent->count, 1);
ent->subdir = RB_ROOT;
refcount_set(&ent->refcnt, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
@ -395,12 +423,11 @@ struct proc_dir_entry *proc_symlink(const char *name,
strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
if (proc_register(parent, ent) < 0) {
kfree(ent->data);
kfree(ent);
pde_free(ent);
ent = NULL;
}
} else {
kfree(ent);
pde_free(ent);
ent = NULL;
}
}
@ -423,7 +450,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->proc_iops = &proc_dir_inode_operations;
parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
pde_free(ent);
parent->nlink--;
ent = NULL;
}
@ -458,7 +485,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->proc_iops = NULL;
parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
pde_free(ent);
parent->nlink--;
ent = NULL;
}
@ -495,7 +522,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
goto out_free;
return pde;
out_free:
kfree(pde);
pde_free(pde);
out:
return NULL;
}
@ -522,19 +549,12 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
}
EXPORT_SYMBOL(proc_set_user);
static void free_proc_entry(struct proc_dir_entry *de)
{
proc_free_inum(de->low_ino);
if (S_ISLNK(de->mode))
kfree(de->data);
kfree(de);
}
void pde_put(struct proc_dir_entry *pde)
{
if (atomic_dec_and_test(&pde->count))
free_proc_entry(pde);
if (refcount_dec_and_test(&pde->refcnt)) {
proc_free_inum(pde->low_ino);
pde_free(pde);
}
}
/*
@ -555,7 +575,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
rb_erase_cached(&de->subdir_node, &parent->subdir);
rb_erase(&de->subdir_node, &parent->subdir);
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@ -592,13 +612,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
rb_erase_cached(&root->subdir_node, &parent->subdir);
rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
rb_erase_cached(&next->subdir_node, &de->subdir);
rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
}

View File

@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode)
}
static struct kmem_cache *proc_inode_cachep __ro_after_init;
static struct kmem_cache *pde_opener_cache __ro_after_init;
static struct inode *proc_alloc_inode(struct super_block *sb)
{
@ -92,7 +93,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
void __init proc_init_inodecache(void)
void __init proc_init_kmemcache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
@ -100,6 +101,13 @@ void __init proc_init_inodecache(void)
SLAB_MEM_SPREAD|SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
pde_opener_cache =
kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
SLAB_ACCOUNT|SLAB_PANIC, NULL);
proc_dir_entry_cache = kmem_cache_create_usercopy(
"proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC,
offsetof(struct proc_dir_entry, inline_name),
sizeof_field(struct proc_dir_entry, inline_name), NULL);
}
static int proc_show_options(struct seq_file *seq, struct dentry *root)
@ -138,7 +146,7 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
/* pde is locked */
/* pde is locked on entry, unlocked on exit */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
/*
@ -157,9 +165,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->c = &c;
spin_unlock(&pde->pde_unload_lock);
wait_for_completion(&c);
spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
struct completion *c;
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
@ -167,9 +176,11 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
spin_lock(&pde->pde_unload_lock);
/* After ->release. */
list_del(&pdeo->lh);
if (unlikely(pdeo->c))
complete(pdeo->c);
kfree(pdeo);
c = pdeo->c;
spin_unlock(&pde->pde_unload_lock);
if (unlikely(c))
complete(c);
kmem_cache_free(pde_opener_cache, pdeo);
}
}
@ -188,6 +199,7 @@ void proc_entry_rundown(struct proc_dir_entry *de)
struct pde_opener *pdeo;
pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
close_pdeo(de, pdeo);
spin_lock(&de->pde_unload_lock);
}
spin_unlock(&de->pde_unload_lock);
}
@ -338,31 +350,36 @@ static int proc_reg_open(struct inode *inode, struct file *file)
*
* Save every "struct file" with custom ->release hook.
*/
pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
if (!use_pde(pde)) {
kfree(pdeo);
if (!use_pde(pde))
return -ENOENT;
}
open = pde->proc_fops->open;
release = pde->proc_fops->release;
release = pde->proc_fops->release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
rv = -ENOMEM;
goto out_unuse;
}
}
open = pde->proc_fops->open;
if (open)
rv = open(inode, file);
if (rv == 0 && release) {
/* To know what to release. */
pdeo->file = file;
pdeo->closing = false;
pdeo->c = NULL;
spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
spin_unlock(&pde->pde_unload_lock);
} else
kfree(pdeo);
if (release) {
if (rv == 0) {
/* To know what to release. */
pdeo->file = file;
pdeo->closing = false;
pdeo->c = NULL;
spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
spin_unlock(&pde->pde_unload_lock);
} else
kmem_cache_free(pde_opener_cache, pdeo);
}
out_unuse:
unuse_pde(pde);
return rv;
}
@ -375,7 +392,7 @@ static int proc_reg_release(struct inode *inode, struct file *file)
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
close_pdeo(pde, pdeo);
break;
return 0;
}
}
spin_unlock(&pde->pde_unload_lock);

View File

@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
@ -36,7 +37,7 @@ struct proc_dir_entry {
* negative -> it's going away RSN
*/
atomic_t in_use;
atomic_t count; /* use count */
refcount_t refcnt;
struct list_head pde_openers; /* who did ->open, but not ->release */
/* protects ->pde_openers and all struct pde_opener instances */
spinlock_t pde_unload_lock;
@ -50,13 +51,22 @@ struct proc_dir_entry {
kgid_t gid;
loff_t size;
struct proc_dir_entry *parent;
struct rb_root_cached subdir;
struct rb_root subdir;
struct rb_node subdir_node;
char *name;
umode_t mode;
u8 namelen;
char name[];
#ifdef CONFIG_64BIT
#define SIZEOF_PDE_INLINE_NAME (192-139)
#else
#define SIZEOF_PDE_INLINE_NAME (128-87)
#endif
char inline_name[SIZEOF_PDE_INLINE_NAME];
} __randomize_layout;
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_show)(struct seq_file *m,
@ -159,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
atomic_inc(&pde->count);
refcount_inc(&pde->refcnt);
return pde;
}
extern void pde_put(struct proc_dir_entry *);
@ -177,12 +187,12 @@ struct pde_opener {
struct list_head lh;
bool closing;
struct completion *c;
};
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
void proc_init_kmemcache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *, void *data, int flags);

View File

@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
{
char v[32];
static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
int len;
len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
seq_write(m, s, 16);
if (len > 0) {
if (len < 8)
seq_write(m, blanks, 8 - len);
seq_write(m, v, len);
}
seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
seq_write(m, " kB\n", 4);
}

View File

@ -192,15 +192,16 @@ static __net_init int proc_net_ns_init(struct net *net)
int err;
err = -ENOMEM;
netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
goto out;
netd->subdir = RB_ROOT_CACHED;
netd->subdir = RB_ROOT;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
netd->parent = &proc_root;
netd->name = netd->inline_name;
memcpy(netd->name, "net", 4);
uid = make_kuid(net->user_ns, 0);
@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net)
return 0;
free_net:
kfree(netd);
pde_free(netd);
out:
return err;
}
@ -231,7 +232,7 @@ out:
static __net_exit void proc_net_ns_exit(struct net *net)
{
remove_proc_entry("stat", net->proc_net);
kfree(net->proc_net);
pde_free(net->proc_net);
}
static struct pernet_operations __net_initdata proc_net_ns_ops = {

View File

@ -707,14 +707,14 @@ static bool proc_sys_link_fill_cache(struct file *file,
struct ctl_table *table)
{
bool ret = true;
head = sysctl_head_grab(head);
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
int err = sysctl_follow_link(&head, &table);
if (err)
goto out;
}
head = sysctl_head_grab(head);
if (IS_ERR(head))
return false;
/* It is not an error if we cannot follow the link; just ignore it */
if (sysctl_follow_link(&head, &table))
goto out;
ret = proc_sys_fill_cache(file, ctx, head, table);
out:
@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if ((table->proc_handler == proc_douintvec) ||
(table->proc_handler == proc_douintvec_minmax)) {
if (table->maxlen != sizeof(unsigned int))
err |= sysctl_err(path, table, "array now allowed");
err |= sysctl_err(path, table, "array not allowed");
}
return err;

View File

@ -123,23 +123,13 @@ static struct file_system_type proc_fs_type = {
void __init proc_root_init(void)
{
int err;
proc_init_inodecache();
proc_init_kmemcache();
set_proc_pid_nlink();
err = register_filesystem(&proc_fs_type);
if (err)
return;
proc_self_init();
proc_thread_self_init();
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
#ifdef CONFIG_SYSVIPC
proc_mkdir("sysvipc", NULL);
#endif
proc_mkdir("fs", NULL);
proc_mkdir("driver", NULL);
proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
@ -150,6 +140,8 @@ void __init proc_root_init(void)
proc_tty_init();
proc_mkdir("bus", NULL);
proc_sys_init();
register_filesystem(&proc_fs_type);
}
static int proc_root_getattr(const struct path *path, struct kstat *stat,
@ -207,12 +199,13 @@ struct proc_dir_entry proc_root = {
.namelen = 5,
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
.nlink = 2,
.count = ATOMIC_INIT(1),
.refcnt = REFCOUNT_INIT(1),
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT_CACHED,
.name = "/proc",
.subdir = RB_ROOT,
.name = proc_root.inline_name,
.inline_name = "/proc",
};
int pid_ns_prepare_proc(struct pid_namespace *ns)

View File

@ -24,6 +24,8 @@
#include <asm/tlbflush.h>
#include "internal.h"
/*
 * Print @str followed by @val shifted left by (PAGE_SHIFT - 10) as a decimal
 * number with a minimum field width of 8.
 *
 * Relies on a seq_file pointer named 'm' being in scope at the point of use.
 * NOTE(review): @val looks like a page count that the shift converts to kB
 * (the strings passed by the callers end in " kB") - confirm.
 */
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long text, lib, swap, anon, file, shmem;
@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
lib = (mm->exec_vm << PAGE_SHIFT) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n"
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
"RssAnon:\t%8lu kB\n"
"RssFile:\t%8lu kB\n"
"RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n"
"VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
total_vm << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
anon << (PAGE_SHIFT-10),
file << (PAGE_SHIFT-10),
shmem << (PAGE_SHIFT-10),
mm->data_vm << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10),
text >> 10,
lib >> 10,
mm_pgtables_bytes(mm) >> 10,
swap << (PAGE_SHIFT-10));
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
SEQ_PUT_DEC(" kB\nRssFile:\t", file);
SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
seq_put_decimal_ull_width(m,
" kB\nVmExe:\t", text >> 10, 8);
seq_put_decimal_ull_width(m,
" kB\nVmLib:\t", lib >> 10, 8);
seq_put_decimal_ull_width(m,
" kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
seq_puts(m, " kB\n");
hugetlb_report_usage(m, mm);
}
#undef SEQ_PUT_DEC
unsigned long task_vsize(struct mm_struct *mm)
{
@ -287,15 +278,18 @@ static void show_vma_header_prefix(struct seq_file *m,
dev_t dev, unsigned long ino)
{
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
start,
end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
pgoff,
MAJOR(dev), MINOR(dev), ino);
seq_put_hex_ll(m, NULL, start, 8);
seq_put_hex_ll(m, "-", end, 8);
seq_putc(m, ' ');
seq_putc(m, flags & VM_READ ? 'r' : '-');
seq_putc(m, flags & VM_WRITE ? 'w' : '-');
seq_putc(m, flags & VM_EXEC ? 'x' : '-');
seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
seq_put_hex_ll(m, " ", pgoff, 8);
seq_put_hex_ll(m, " ", MAJOR(dev), 2);
seq_put_hex_ll(m, ":", MINOR(dev), 2);
seq_put_decimal_ull(m, " ", ino);
seq_putc(m, ' ');
}
static void
@ -694,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
if (!mnemonics[i][0])
continue;
if (vma->vm_flags & (1UL << i)) {
seq_printf(m, "%c%c ",
mnemonics[i][0], mnemonics[i][1]);
seq_putc(m, mnemonics[i][0]);
seq_putc(m, mnemonics[i][1]);
seq_putc(m, ' ');
}
}
seq_putc(m, '\n');
@ -736,6 +731,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
}
/*
 * Print @str followed by @val shifted right by 10 (i.e. divided by 1024) as a
 * decimal number with a minimum field width of 8.
 *
 * Relies on a seq_file pointer named 'm' being in scope at the point of use.
 * NOTE(review): @val is presumably a byte count being reported in kB - confirm.
 */
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
@ -809,51 +806,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
ret = SEQ_SKIP;
}
if (!rollup_mode)
seq_printf(m,
"Size: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10);
if (!rollup_mode || last_vma)
seq_printf(m,
"Rss: %8lu kB\n"
"Pss: %8lu kB\n"
"Shared_Clean: %8lu kB\n"
"Shared_Dirty: %8lu kB\n"
"Private_Clean: %8lu kB\n"
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"LazyFree: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
"Private_Hugetlb: %7lu kB\n"
"Swap: %8lu kB\n"
"SwapPss: %8lu kB\n"
"Locked: %8lu kB\n",
mss->resident >> 10,
(unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
mss->shared_clean >> 10,
mss->shared_dirty >> 10,
mss->private_clean >> 10,
mss->private_dirty >> 10,
mss->referenced >> 10,
mss->anonymous >> 10,
mss->lazyfree >> 10,
mss->anonymous_thp >> 10,
mss->shmem_thp >> 10,
mss->shared_hugetlb >> 10,
mss->private_hugetlb >> 10,
mss->swap >> 10,
(unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
(unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
if (!rollup_mode) {
SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
seq_puts(m, " kB\n");
}
if (!rollup_mode || last_vma) {
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
SEQ_PUT_DEC(" kB\nSwapPss: ",
mss->swap_pss >> PSS_SHIFT);
SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT);
seq_puts(m, " kB\n");
}
if (!rollup_mode) {
arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
@ -861,6 +841,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
m_cache_vma(m, vma);
return ret;
}
#undef SEQ_PUT_DEC
static int show_pid_smap(struct seq_file *m, void *v)
{

View File

@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super,
if (IS_ERR(journal->j_dev_bd)) {
result = PTR_ERR(journal->j_dev_bd);
journal->j_dev_bd = NULL;
reiserfs_warning(super,
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;

View File

@ -6,6 +6,7 @@
* initial implementation -- AV, Oct 2001.
*/
#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
@ -19,6 +20,8 @@
#include <linux/uaccess.h>
#include <asm/page.h>
static struct kmem_cache *seq_file_cache __ro_after_init;
static void seq_set_overflow(struct seq_file *m)
{
m->count = m->size;
@ -26,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
return kvmalloc(size, GFP_KERNEL);
return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}
/**
@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
WARN_ON(file->private_data);
p = kzalloc(sizeof(*p), GFP_KERNEL);
p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
if (!p)
return -ENOMEM;
@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
kvfree(m->buf);
kfree(m);
kmem_cache_free(seq_file_cache, m);
return 0;
}
EXPORT_SYMBOL(seq_release);
@ -563,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v)
int single_open(struct file *file, int (*show)(struct seq_file *, void *),
void *data)
{
struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
int res = -ENOMEM;
if (op) {
@ -625,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
void *private;
struct seq_file *seq;
private = kzalloc(psize, GFP_KERNEL);
private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
if (private == NULL)
goto out;
@ -673,29 +676,37 @@ void seq_puts(struct seq_file *m, const char *s)
}
EXPORT_SYMBOL(seq_puts);
/*
/**
* A helper routine for putting decimal numbers without rich format of printf().
* only 'unsigned long long' is supported.
* This routine will put strlen(delimiter) + number into seq_file.
* @m: seq_file identifying the buffer to which data should be written
* @delimiter: a string which is printed before the number
* @num: the number
* @width: a minimum field width
*
* This routine will put strlen(delimiter) + number into seq_file.
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
unsigned long long num)
void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
unsigned long long num, unsigned int width)
{
int len;
if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
len = strlen(delimiter);
if (m->count + len >= m->size)
goto overflow;
if (delimiter && delimiter[0]) {
if (delimiter[1] == 0)
seq_putc(m, delimiter[0]);
else
seq_puts(m, delimiter);
}
memcpy(m->buf + m->count, delimiter, len);
m->count += len;
if (!width)
width = 1;
if (m->count + 1 >= m->size)
if (m->count + width >= m->size)
goto overflow;
if (num < 10) {
@ -703,7 +714,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
return;
}
len = num_to_str(m->buf + m->count, m->size - m->count, num);
len = num_to_str(m->buf + m->count, m->size - m->count, num, width);
if (!len)
goto overflow;
@ -713,8 +724,60 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
overflow:
seq_set_overflow(m);
}
/**
 * seq_put_decimal_ull - put an unsigned decimal number into a seq_file
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @num: the number
 *
 * Convenience wrapper around seq_put_decimal_ull_width() that passes a
 * width of 0, i.e. requests no minimum field width.
 */
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
			 unsigned long long num)
{
	/*
	 * Plain call rather than "return <expr>;": a return statement with an
	 * expression is not permitted in a function returning void.
	 */
	seq_put_decimal_ull_width(m, delimiter, num, 0);
}
EXPORT_SYMBOL(seq_put_decimal_ull);
/**
 * seq_put_hex_ll - write a number to a seq_file as lowercase-table hex digits
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number (may be NULL or "")
 * @v: the number
 * @width: a minimum field width; shorter values are zero-padded on the left
 *
 * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v)
 *
 * If the digits do not fit into the remaining buffer space the seq_file is
 * marked as overflowed and no digit is written.
 *
 * This routine is very quick when you show lots of numbers.
 * In usual cases, it will be better to use seq_printf(). It's easier to read.
 */
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
		    unsigned long long v, unsigned int width)
{
	unsigned int ndigits = 1;
	int pos;

	/* Single-character delimiters take the cheaper seq_putc() path. */
	if (delimiter && delimiter[0] != '\0') {
		if (delimiter[1] != '\0')
			seq_puts(m, delimiter);
		else
			seq_putc(m, delimiter[0]);
	}

	/*
	 * __builtin_clzll(0) is undefined, so a zero value keeps the
	 * one-digit default instead of going through the computation.
	 */
	if (v != 0)
		ndigits = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4;

	if (width > ndigits)
		ndigits = width;

	if (m->count + ndigits > m->size) {
		seq_set_overflow(m);
		return;
	}

	/* Fill the field from its last position backwards, one nibble at a time. */
	pos = ndigits - 1;
	while (pos >= 0) {
		m->buf[m->count + pos] = hex_asc[v & 0xf];
		v >>= 4;
		pos--;
	}
	m->count += ndigits;
}
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
int len;
@ -722,12 +785,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
len = strlen(delimiter);
if (m->count + len >= m->size)
goto overflow;
memcpy(m->buf + m->count, delimiter, len);
m->count += len;
if (delimiter && delimiter[0]) {
if (delimiter[1] == 0)
seq_putc(m, delimiter[0]);
else
seq_puts(m, delimiter);
}
if (m->count + 2 >= m->size)
goto overflow;
@ -742,7 +805,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
return;
}
len = num_to_str(m->buf + m->count, m->size - m->count, num);
len = num_to_str(m->buf + m->count, m->size - m->count, num, 0);
if (!len)
goto overflow;
@ -782,8 +845,14 @@ EXPORT_SYMBOL(seq_write);
void seq_pad(struct seq_file *m, char c)
{
int size = m->pad_until - m->count;
if (size > 0)
seq_printf(m, "%*s", size, "");
if (size > 0) {
if (size + m->count > m->size) {
seq_set_overflow(m);
return;
}
memset(m->buf + m->count, ' ', size);
m->count += size;
}
if (c)
seq_putc(m, c);
}
@ -1040,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
return NULL;
}
EXPORT_SYMBOL(seq_hlist_next_percpu);
/*
 * Create seq_file_cache, the slab cache for struct seq_file objects that
 * seq_open() allocates from and seq_release() frees to. The cache is created
 * with the SLAB_ACCOUNT and SLAB_PANIC flags.
 */
void __init seq_file_init(void)
{
seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC);
}

View File

@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty(
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty) {
/* sigh - __set_page_dirty() is static, so copy it here, too */
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
if (newly_dirty)
__set_page_dirty(page, mapping, 1);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

View File

@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
}
long congestion_wait(int sync, long timeout);
long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
long wait_iff_congested(int sync, long timeout);
static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
{
@ -329,7 +329,7 @@ static inline bool inode_to_wb_is_valid(struct inode *inode)
* @inode: inode of interest
*
* Returns the wb @inode is currently associated with. The caller must be
* holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
* holding either @inode->i_lock, the i_pages lock, or the
* associated wb's list_lock.
*/
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
@ -337,7 +337,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
#ifdef CONFIG_LOCKDEP
WARN_ON_ONCE(debug_locks &&
(!lockdep_is_held(&inode->i_lock) &&
!lockdep_is_held(&inode->i_mapping->tree_lock) &&
!lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
!lockdep_is_held(&inode->i_wb->list_lock)));
#endif
return inode->i_wb;
@ -349,7 +349,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
* @lockedp: temp bool output param, to be passed to the end function
*
* The caller wants to access the wb associated with @inode but isn't
* holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
* holding inode->i_lock, the i_pages lock or wb->list_lock. This
* function determines the wb associated with @inode and ensures that the
* association doesn't change until the transaction is finished with
* unlocked_inode_to_wb_end().
@ -370,11 +370,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
if (unlikely(*lockedp))
spin_lock_irq(&inode->i_mapping->tree_lock);
xa_lock_irq(&inode->i_mapping->i_pages);
/*
* Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
* inode_to_wb() will bark. Deref directly.
* Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
* lock. inode_to_wb() will bark. Deref directly.
*/
return inode->i_wb;
}
@ -387,7 +387,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
{
if (unlikely(locked))
spin_unlock_irq(&inode->i_mapping->tree_lock);
xa_unlock_irq(&inode->i_mapping->i_pages);
rcu_read_unlock();
}

View File

@ -61,6 +61,8 @@ struct linux_binprm {
unsigned interp_flags;
unsigned interp_data;
unsigned long loader, exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
} __randomize_layout;
#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
@ -118,6 +120,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *);
extern int search_binary_handler(struct linux_binprm *);
extern int flush_old_exec(struct linux_binprm * bprm);
extern void setup_new_exec(struct linux_binprm * bprm);
extern void finalize_exec(struct linux_binprm *bprm);
extern void would_dump(struct linux_binprm *, struct file *);
extern int suid_dumpable;

View File

@ -17,9 +17,6 @@
*/
#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
#define randomized_struct_fields_start struct {
#define randomized_struct_fields_end };
/* all clang versions usable with the kernel support KASAN ABI version 5 */
#define KASAN_ABI_VERSION 5

View File

@ -242,6 +242,9 @@
#if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__)
#define __randomize_layout __attribute__((randomize_layout))
#define __no_randomize_layout __attribute__((no_randomize_layout))
/* This anon struct can add padding, so only enable it under randstruct. */
#define randomized_struct_fields_start struct {
#define randomized_struct_fields_end } __randomize_layout;
#endif
#endif /* GCC_VERSION >= 40500 */
@ -256,15 +259,6 @@
*/
#define __visible __attribute__((externally_visible))
/*
* RANDSTRUCT_PLUGIN wants to use an anonymous struct, but it is only
* possible since GCC 4.6. To provide as much build testing coverage
* as possible, this is used for all GCC 4.6+ builds, and not just on
* RANDSTRUCT_PLUGIN builds.
*/
#define randomized_struct_fields_start struct {
#define randomized_struct_fields_end } __randomize_layout;
#endif /* GCC_VERSION >= 40600 */

View File

@ -0,0 +1,9 @@
#ifndef _LINUX_CONST_H
#define _LINUX_CONST_H
#include <uapi/linux/const.h>
/*
* UL() and ULL() simply forward to _UL() and _ULL(), which are expected to be
* provided by <uapi/linux/const.h> included above.
*/
#define UL(x) (_UL(x))
#define ULL(x) (_ULL(x))
#endif /* _LINUX_CONST_H */

View File

@ -13,6 +13,7 @@
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
@ -390,12 +391,11 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
struct radix_tree_root i_pages; /* cached pages */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
struct rb_root_cached i_mmap; /* tree of private and shared mappings */
struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
/* Protected by the i_pages lock */
unsigned long nrpages; /* number of total pages */
/* number of shadow or DAX exceptional entries */
unsigned long nrexceptional;
@ -1989,7 +1989,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
*
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
* synchronize competing switching instances and to tell
* wb stat updates to grab mapping->tree_lock. See
* wb stat updates to grab the i_pages lock. See
* inode_switch_wb_work_fn() for details.
*
* I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper

View File

@ -80,76 +80,145 @@
struct hmm;
/*
* hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
* hmm_pfn_flag_e - HMM flag enums
*
* Flags:
* HMM_PFN_VALID: pfn is valid
* HMM_PFN_READ: CPU page table has read permission set
* HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
* HMM_PFN_WRITE: CPU page table has write permission set
* HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
*
* The driver provides a flags array. For example, if the driver's valid bit
* for an entry is bit 3, i.e. (entry & (1 << 3)) is true when the entry is
* valid, then the driver must provide an array in hmm_range.flags with
* hmm_range.flags[HMM_PFN_VALID] == 1 << 3. The same logic applies to all
* flags. This is the same idea as vm_page_prot in a vma, except that it is
* per device driver rather than per architecture.
*/
enum hmm_pfn_flag_e {
HMM_PFN_VALID = 0,
HMM_PFN_WRITE,
HMM_PFN_DEVICE_PRIVATE,
HMM_PFN_FLAG_MAX
};
/*
* hmm_pfn_value_e - HMM pfn special value
*
* Flags:
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
* HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
* result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
* be mirrored by a device, because the entry will never have HMM_PFN_VALID
* set and the pfn value is undefined.
* HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
*/
typedef unsigned long hmm_pfn_t;
#define HMM_PFN_VALID (1 << 0)
#define HMM_PFN_READ (1 << 1)
#define HMM_PFN_WRITE (1 << 2)
#define HMM_PFN_ERROR (1 << 3)
#define HMM_PFN_EMPTY (1 << 4)
#define HMM_PFN_SPECIAL (1 << 5)
#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
#define HMM_PFN_SHIFT 7
/*
* hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
* @pfn: hmm_pfn_t to convert to struct page
* Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise
*
* If the hmm_pfn_t is valid (ie valid flag set) then return the struct page
* matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL.
* The driver provides the entry values for the none, error and special
* entries. The driver can alias values (i.e. use the same value for error and
* special, for instance). It should not alias none with error or special.
*
* HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
* hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
* hmm_range.values[HMM_PFN_NONE] if there is no CPU page table
* hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
*/
static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn)
enum hmm_pfn_value_e {
HMM_PFN_ERROR,
HMM_PFN_NONE,
HMM_PFN_SPECIAL,
HMM_PFN_VALUE_MAX
};
/*
* struct hmm_range - track invalidation lock on virtual address range
*
* @vma: the vm area struct for the range
* @list: all range locks are on a list
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @pfns: array of pfns (big enough for the range)
* @flags: pfn flags to match device driver page table (indexed by
*         enum hmm_pfn_flag_e)
* @values: pfn values for some special cases (none, special, error, ...),
*          indexed by enum hmm_pfn_value_e
* @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it was filled by an HMM function
*/
struct hmm_range {
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
unsigned long end;
uint64_t *pfns;
const uint64_t *flags;
const uint64_t *values;
uint8_t pfn_shift;
bool valid;
};
/*
* hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
* @range: range used to decode the HMM pfn value
* @pfn: HMM pfn value to get the corresponding struct page from
* Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
*
* If the HMM pfn is valid (i.e. the valid flag is set) then return the struct
* page matching the pfn value stored in the HMM pfn. Otherwise return NULL.
*/
static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
uint64_t pfn)
{
if (!(pfn & HMM_PFN_VALID))
if (pfn == range->values[HMM_PFN_NONE])
return NULL;
return pfn_to_page(pfn >> HMM_PFN_SHIFT);
if (pfn == range->values[HMM_PFN_ERROR])
return NULL;
if (pfn == range->values[HMM_PFN_SPECIAL])
return NULL;
if (!(pfn & range->flags[HMM_PFN_VALID]))
return NULL;
return pfn_to_page(pfn >> range->pfn_shift);
}
/*
* hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t
* @pfn: hmm_pfn_t to extract pfn from
* Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise
* hmm_pfn_to_pfn() - return the pfn value stored in an HMM pfn
* @range: range used to decode the HMM pfn value
* @pfn: HMM pfn value to extract the pfn from
* Returns: pfn value if HMM pfn is valid, -1UL otherwise
*/
static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn)
static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
uint64_t pfn)
{
if (!(pfn & HMM_PFN_VALID))
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
return (pfn >> HMM_PFN_SHIFT);
if (pfn == range->values[HMM_PFN_ERROR])
return -1UL;
if (pfn == range->values[HMM_PFN_SPECIAL])
return -1UL;
if (!(pfn & range->flags[HMM_PFN_VALID]))
return -1UL;
return (pfn >> range->pfn_shift);
}
/*
* hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page
* @page: struct page pointer for which to create the hmm_pfn_t
* Returns: valid hmm_pfn_t for the page
* hmm_pfn_from_page() - create a valid HMM pfn value from struct page
* @range: range used to encode the HMM pfn value
* @page: struct page pointer for which to create the HMM pfn
* Returns: valid HMM pfn for the page
*/
static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page)
static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
struct page *page)
{
return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID;
return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
/*
* hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn
* @pfn: pfn value for which to create the hmm_pfn_t
* Returns: valid hmm_pfn_t for the pfn
* hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
* @range: range used to encode the HMM pfn value
* @pfn: pfn value for which to create the HMM pfn
* Returns: valid HMM pfn for the pfn
*/
static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn)
static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
unsigned long pfn)
{
return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID;
return (pfn << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
@ -218,6 +287,16 @@ enum hmm_update_type {
* @update: callback to update range on a device
*/
struct hmm_mirror_ops {
/* release() - release hmm_mirror
*
* @mirror: pointer to struct hmm_mirror
*
* This is called when the mm_struct is being released.
* The callback should make sure no references to the mirror occur
* after the callback returns.
*/
void (*release)(struct hmm_mirror *mirror);
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
@ -261,23 +340,6 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);
/*
* struct hmm_range - track invalidation lock on virtual address range
*
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @pfns: array of pfns (big enough for the range)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
struct list_head list;
unsigned long start;
unsigned long end;
hmm_pfn_t *pfns;
bool valid;
};
/*
* To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
* driver lock that serializes device page table updates, then call
@ -291,17 +353,13 @@ struct hmm_range {
*
* IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
*/
int hmm_vma_get_pfns(struct vm_area_struct *vma,
struct hmm_range *range,
unsigned long start,
unsigned long end,
hmm_pfn_t *pfns);
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
int hmm_vma_get_pfns(struct hmm_range *range);
bool hmm_vma_range_done(struct hmm_range *range);
/*
* Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
* not migrate any device memory back to system memory. The hmm_pfn_t array will
* not migrate any device memory back to system memory. The HMM pfn array will
* be updated with the fault result and current snapshot of the CPU page table
* for the range.
*
@ -310,22 +368,26 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
* function returns -EAGAIN.
*
* Return value does not reflect if the fault was successful for every single
* address or not. Therefore, the caller must to inspect the hmm_pfn_t array to
* address or not. Therefore, the caller must inspect the HMM pfn array to
* determine fault status for each address.
*
* Trying to fault inside an invalid vma will result in -EINVAL.
*
* See the function description in mm/hmm.c for further documentation.
*/
int hmm_vma_fault(struct vm_area_struct *vma,
struct hmm_range *range,
unsigned long start,
unsigned long end,
hmm_pfn_t *pfns,
bool write,
bool block);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
int hmm_vma_fault(struct hmm_range *range, bool block);
/* Below are for HMM internal use only! Not to be used by device driver! */
void hmm_mm_destroy(struct mm_struct *mm);
static inline void hmm_mm_init(struct mm_struct *mm)
{
mm->hmm = NULL;
}
#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
static inline void hmm_mm_destroy(struct mm_struct *mm) {}
static inline void hmm_mm_init(struct mm_struct *mm) {}
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct hmm_devmem;
@ -498,23 +560,9 @@ struct hmm_device {
struct hmm_device *hmm_device_new(void *drvdata);
void hmm_device_put(struct hmm_device *hmm_device);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
#endif /* IS_ENABLED(CONFIG_HMM) */
/* Below are for HMM internal use only! Not to be used by device driver! */
#if IS_ENABLED(CONFIG_HMM_MIRROR)
void hmm_mm_destroy(struct mm_struct *mm);
static inline void hmm_mm_init(struct mm_struct *mm)
{
mm->hmm = NULL;
}
#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
static inline void hmm_mm_destroy(struct mm_struct *mm) {}
static inline void hmm_mm_init(struct mm_struct *mm) {}
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
#else /* IS_ENABLED(CONFIG_HMM) */
static inline void hmm_mm_destroy(struct mm_struct *mm) {}
static inline void hmm_mm_init(struct mm_struct *mm) {}
#endif /* IS_ENABLED(CONFIG_HMM) */
#endif /* LINUX_HMM_H */

View File

@ -29,29 +29,31 @@ struct idr {
#define IDR_FREE 0
/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT))
#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \
(1 << (ROOT_TAG_SHIFT + IDR_FREE)))
#define IDR_INIT_BASE(base) { \
.idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \
#define IDR_INIT_BASE(name, base) { \
.idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \
.idr_base = (base), \
.idr_next = 0, \
}
/**
* IDR_INIT() - Initialise an IDR.
* @name: Name of IDR.
*
* A freshly-initialised IDR contains no IDs.
*/
#define IDR_INIT IDR_INIT_BASE(0)
#define IDR_INIT(name) IDR_INIT_BASE(name, 0)
/**
* DEFINE_IDR() - Define a statically-allocated IDR
* @name: Name of IDR
* DEFINE_IDR() - Define a statically-allocated IDR.
* @name: Name of IDR.
*
* An IDR defined using this macro is ready for use with no additional
* initialisation required. It contains no IDs.
*/
#define DEFINE_IDR(name) struct idr name = IDR_INIT
#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
/**
* idr_get_cursor - Return the current position of the cyclic allocator
@ -218,10 +220,10 @@ struct ida {
struct radix_tree_root ida_rt;
};
#define IDA_INIT { \
.ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \
#define IDA_INIT(name) { \
.ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
}
#define DEFINE_IDA(name) struct ida name = IDA_INIT
#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);

View File

@ -439,7 +439,8 @@ extern long simple_strtol(const char *,char **,unsigned int);
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
extern long long simple_strtoll(const char *,char **,unsigned int);
extern int num_to_str(char *buf, int size, unsigned long long num);
extern int num_to_str(char *buf, int size,
unsigned long long num, unsigned int width);
/* lib/printf utilities */
@ -543,6 +544,7 @@ extern enum system_states {
SYSTEM_RESTART,
} system_state;
/* This cannot be an enum because some may be used in assembly source. */
#define TAINT_PROPRIETARY_MODULE 0
#define TAINT_FORCED_MODULE 1
#define TAINT_CPU_OUT_OF_SPEC 2
@ -560,7 +562,8 @@ extern enum system_states {
#define TAINT_SOFTLOCKUP 14
#define TAINT_LIVEPATCH 15
#define TAINT_AUX 16
#define TAINT_FLAGS_COUNT 17
#define TAINT_RANDSTRUCT 17
#define TAINT_FLAGS_COUNT 18
struct taint_flag {
char c_true; /* character printed when tainted */

View File

@ -41,11 +41,11 @@
*/
/*
* Note about locking : There is no locking required until only * one reader
* and one writer is using the fifo and no kfifo_reset() will be * called
* kfifo_reset_out() can be safely used, until it will be only called
* Note about locking: There is no locking required as long as only one reader
* and one writer are using the fifo and kfifo_reset() is never called.
* kfifo_reset_out() can be safely used, as long as it is only called
* in the reader thread.
* For multiple writer and one reader there is only a need to lock the writer.
* For multiple writers and one reader there is only a need to lock the writers.
* And vice versa: for only one writer and multiple readers there is only a need
* to lock the readers.
*/

View File

@ -48,13 +48,12 @@ enum memcg_stat_item {
MEMCG_NR_STAT,
};
/* Cgroup-specific events, on top of universal VM events */
enum memcg_event_item {
MEMCG_LOW = NR_VM_EVENT_ITEMS,
enum memcg_memory_event {
MEMCG_LOW,
MEMCG_HIGH,
MEMCG_MAX,
MEMCG_OOM,
MEMCG_NR_EVENTS,
MEMCG_NR_MEMORY_EVENTS,
};
struct mem_cgroup_reclaim_cookie {
@ -88,7 +87,7 @@ enum mem_cgroup_events_target {
struct mem_cgroup_stat_cpu {
long count[MEMCG_NR_STAT];
unsigned long events[MEMCG_NR_EVENTS];
unsigned long events[NR_VM_EVENT_ITEMS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@ -120,6 +119,9 @@ struct mem_cgroup_per_node {
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
bool congested; /* memcg has many dirty pages */
/* backed by a congested BDI */
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
@ -202,7 +204,8 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
/* handle for "memory.events" */
/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
struct cgroup_file events_file;
/* protect arrays of thresholds */
@ -231,9 +234,10 @@ struct mem_cgroup {
struct task_struct *move_lock_task;
unsigned long move_lock_flags;
/* memory.stat */
struct mem_cgroup_stat_cpu __percpu *stat_cpu;
atomic_long_t stat[MEMCG_NR_STAT];
atomic_long_t events[MEMCG_NR_EVENTS];
atomic_long_t events[NR_VM_EVENT_ITEMS];
unsigned long socket_pressure;
@ -645,9 +649,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
/* idx can be of type enum memcg_event_item or vm_event_item */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
int idx, unsigned long count)
enum vm_event_item idx,
unsigned long count)
{
unsigned long x;
@ -663,7 +667,8 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
}
static inline void count_memcg_events(struct mem_cgroup *memcg,
int idx, unsigned long count)
enum vm_event_item idx,
unsigned long count)
{
unsigned long flags;
@ -672,9 +677,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
local_irq_restore(flags);
}
/* idx can be of type enum memcg_event_item or vm_event_item */
static inline void count_memcg_page_event(struct page *page,
int idx)
enum vm_event_item idx)
{
if (page->mem_cgroup)
count_memcg_events(page->mem_cgroup, idx, 1);
@ -698,10 +702,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
rcu_read_unlock();
}
static inline void mem_cgroup_event(struct mem_cgroup *memcg,
enum memcg_event_item event)
static inline void memcg_memory_event(struct mem_cgroup *memcg,
enum memcg_memory_event event)
{
count_memcg_events(memcg, event, 1);
atomic_long_inc(&memcg->memory_events[event]);
cgroup_file_notify(&memcg->events_file);
}
@ -721,8 +725,8 @@ static inline bool mem_cgroup_disabled(void)
return true;
}
static inline void mem_cgroup_event(struct mem_cgroup *memcg,
enum memcg_event_item event)
static inline void memcg_memory_event(struct mem_cgroup *memcg,
enum memcg_memory_event event)
{
}

View File

@ -216,9 +216,6 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
extern void set_zone_contiguous(struct zone *zone);
extern void clear_zone_contiguous(struct zone *zone);
#else /* ! CONFIG_MEMORY_HOTPLUG */
#define pfn_to_online_page(pfn) \
({ \

View File

@ -7,8 +7,7 @@
#include <linux/migrate_mode.h>
#include <linux/hugetlb.h>
typedef struct page *new_page_t(struct page *page, unsigned long private,
int **reason);
typedef struct page *new_page_t(struct page *page, unsigned long private);
typedef void free_page_t(struct page *page, unsigned long private);
/*
@ -43,9 +42,9 @@ static inline struct page *new_page_nodemask(struct page *page,
return alloc_huge_page_nodemask(page_hstate(compound_head(page)),
preferred_nid, nodemask);
if (thp_migration_supported() && PageTransHuge(page)) {
order = HPAGE_PMD_ORDER;
if (PageTransHuge(page)) {
gfp_mask |= GFP_TRANSHUGE;
order = HPAGE_PMD_ORDER;
}
if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))

View File

@ -747,7 +747,7 @@ int finish_mkwrite_fault(struct vm_fault *vmf);
* refcount. Each user mapping also has a reference to the page.
*
* The pagecache pages are stored in a per-mapping radix tree, which is
* rooted at mapping->page_tree, and indexed by offset.
* rooted at mapping->i_pages, and indexed by offset.
* Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
* lists, we instead now tag pages as dirty/writeback in the radix tree.
*
@ -1466,6 +1466,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
void __set_page_dirty(struct page *, struct address_space *, int warn);
int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
@ -2108,6 +2109,7 @@ extern void setup_per_cpu_pageset(void);
extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);
extern void setup_zone_pageset(struct zone *zone);
/* page_alloc.c */
extern int min_free_kbytes;

View File

@ -180,6 +180,7 @@ enum node_stat_item {
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */
NR_VM_NODE_STAT_ITEMS
};
@ -884,7 +885,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,

View File

@ -63,7 +63,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages);
struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp);
struct page *alloc_migrate_target(struct page *page, unsigned long private);
#endif

Some files were not shown because too many files have changed in this diff Show More