alistair23-linux/arch/ia64/lib/strlen.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard strlen() function
 *
 *
 * Inputs:
 *	in0	address of string
 *
 * Outputs:
 *	ret0	the number of characters in the string (0 if empty string)
 *	does not count the \0
 *
 * Copyright (C) 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 09/24/99 S.Eranian add speculation recovery code
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

//
//
// This is an enhanced version of the basic strlen. It includes a combination
// of compute zero index (czx), parallel comparisons, speculative loads and
// loop unroll using rotating registers.
//
// General Ideas about the algorithm:
//	  The goal is to look at the string in chunks of 8 bytes.
//	  so we need to do a few extra checks at the beginning because the
//	  string may not be 8-byte aligned. In this case we load the 8-byte
//	  quantity which includes the start of the string and mask the unused
//	  bytes with 0xff to avoid confusing czx.
//	  We use speculative loads and software pipelining to hide memory
//	  latency and do read ahead safely. This way we defer any exception.
//
//	  Because we don't want the kernel to be relying on particular
//	  settings of the DCR register, we provide recovery code in case
//	  speculation fails. The recovery code is going to "redo" the work using
//	  only normal loads. If we still get a fault then we generate a
//	  kernel panic. Otherwise we return the strlen as usual.
//
//	  The fact that speculation may fail can be caused, for instance, by
//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
//	  a NaT bit will be set if the translation is not present. The normal
//	  load, on the other hand, will cause the translation to be inserted
//	  if the mapping exists.
//
//	  It should be noted that we execute recovery code only when we need
//	  to use the data that has been speculatively loaded: we don't execute
//	  recovery code on pure read ahead data.
//
// Remarks:
//	- the cmp r0,r0 is used as a fast way to initialize a predicate
//	  register to 1. This is required to make sure that we get the parallel
//	  compare correct.
//
//	- we don't use the epilogue counter to exit the loop but we need to set
//	  it to zero beforehand.
//
//	- after the loop we must test for NaT values because neither the
//	  czx nor the cmp instruction raises a NaT consumption fault. We must
//	  be careful not to look too far for a NaT that we don't care about.
//	  For instance we don't need to look at a NaT in val2 if the zero byte
//	  was in val1.
//
//	- Clearly performance tuning is required.
//
//
//
//
// Register aliases used by strlen below. The comments are kept on their own
// lines so that they can never become part of a macro expansion.
//
// saved_pfs: copy of ar.pfs produced by alloc, written back before returning
#define saved_pfs	r11
// tmp: scratch (misalignment in bits, mask shift count, final adjustment)
#define	tmp		r10
// base: 8-byte aligned address of the first word; consumed by .recover
#define base		r16
// orig: the unmodified string address received in in0
#define orig		r17
// saved_pr: copy of the predicate registers taken on entry
#define saved_pr	r18
// src: post-incremented load pointer of the speculative main loop
#define src		r19
// mask: 0xff in the bytes of the first word that precede the string start
#define mask		r20
// val: word loaded with a normal (non-speculative) load in .recover
#define val		r21
// val1, val2: czx1.r results (8 means "no zero byte in that word")
#define val1		r22
#define val2		r23

//
// strlen: return in ret0 the number of bytes that precede the first \0 of
// the string whose address is passed in in0.
//
// Layout of the routine:
//	- prologue: save ar.pfs and the predicates, compute the aligned pointer
//	- setup: load the first aligned word with a normal load, start the
//	  speculative read ahead and build the mask for the leading bytes
//	- loop 1: examine two 8-byte words per iteration, loading the next two
//	  speculatively
//	- exit checks: go to .recover if a word the result depends on is NaT,
//	  otherwise compute the length from src and the czx result
//	- .recover: rescan from the aligned base using normal loads only
//
GLOBAL_ENTRY(strlen)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8

	.rotr v[2], w[2]	// declares our 4 aliases

	extr.u tmp=in0,0,3	// tmp=least significant 3 bits (byte offset in word)
	mov orig=in0		// keep track of initial byte address
	dep src=0,in0,0,3	// src=8byte-aligned in0 address
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
	;;

	.body

	ld8 v[1]=[src],8	// must not speculate: can fail here
	shl tmp=tmp,3		// multiply by 8bits/byte
	mov mask=-1		// our mask
	;;
	ld8.s w[1]=[src],8	// speculatively load next
	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
	sub tmp=64,tmp		// how many bits to shift our mask on the right
	;;
	// mask ends up with 0xff in each byte that lies before the start of
	// the string inside the first aligned word, and 0 elsewhere.
	// NOTE(review): for an already aligned string the shift count is 64;
	// this relies on shr.u yielding 0 for such a count - confirm against
	// the ISA manual.
	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
	;;
	// src has been advanced by 16 by the two loads above, so src-16 is the
	// aligned address of the first word; .recover restarts from there.
	add base=-16,src	// keep track of aligned base
	or v[1]=v[1],mask	// now we have a safe initial byte pattern
	;;
	//
	// Main loop: each iteration checks the two words loaded previously
	// (v[1], w[1]) and issues the loads for the next two (v[0], w[0]),
	// so src advances by 16 per iteration.
	//
1:
	ld8.s v[0]=[src],8	// speculatively load next
	czx1.r val1=v[1]	// search 0 byte from right
	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
	;;
	ld8.s w[0]=[src],8	// speculatively load next to next
	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
(p6)	br.wtop.dptk 1b		// loop until p6 == 0
	;;
	//
	// We must try the recovery code iff
	// val1_is_nat || (val1==8 && val2_is_nat)
	//
	// XXX Fixme
	//	- there must be a better way of doing the test
	//
	cmp.eq  p8,p9=8,val1	// p8 = no zero in val1, p9 = zero is in val1 (disambiguate)
	tnat.nz p6,p7=val1	// test NaT on val1
(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
	;;
	//
	// if we come here p7 is true, i.e., initialized for the parallel cmp
	//
	cmp.eq.and  p7,p0=8,val1// val1==8?
	tnat.nz.and p7,p0=val2	// test NaT if val2
(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
	;;
	//
	// Move src back to the address just past the word that holds the \0:
	// p8: the \0 is in the second word checked, src is 3 words past its start
	// p9: the \0 is in the first word checked, src is 4 words past its start
	//
(p8)	mov val1=val2		// the other test got us out of the loop
(p8)	adds src=-16,src	// correct position when 3 ahead
(p9)	adds src=-24,src	// correct position when 4 ahead
	;;
	sub ret0=src,orig	// distance from orig to the end of the word holding the \0
	sub tmp=8,val1		// bytes from the \0 (included) to the end of that word
	// restore only the predicates selected by the mask (upper 48 bits);
	// presumably these are the rotating ones - see "preserve predicates" above
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// adjust: drop the \0 and the bytes after it
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.many rp	// end of normal execution

	//
	// Outlined recovery code when speculation failed
	//
	// This time we don't use speculation and rely on the normal exception
	// mechanism. That's why the loop is not as good as the previous one:
	// read ahead is not possible.
	//
	// IMPORTANT:
	// Please note that in the case of strlen() as opposed to strlen_user()
	// we don't use the exception mechanism, as this function is not
	// supposed to fail. If that happens it means we have a bug and the
	// code will cause a kernel fault.
	//
	// XXX Fixme
	//	- today we restart from the beginning of the string instead
	//	  of trying to continue where we left off.
	//
.recover:
	// base still holds the aligned address of the first word and mask is
	// unchanged since setup, so the first word is reloaded and remasked.
	ld8 val=[base],8	// will fail if unrecoverable fault
	;;
	or val=val,mask		// remask first bytes
	cmp.eq p0,p6=r0,r0	// p6 = false: nullify first ld8 in loop
	;;
	//
	// ar.ec is still zero here
	//
	// One word per iteration; on the first pass the load is skipped
	// because the (masked) first word is already in val.
	//
2:
(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
	;;
	czx1.r val1=val		// search 0 byte from right
	;;
	cmp.eq p6,p0=8,val1	// val1==8 ?
(p6)	br.wtop.dptk 2b		// loop until p6 == 0
	;;			// (avoid WAW on p63)
	// base was post-incremented by the load, so it already points just
	// past the word that holds the \0.
	sub ret0=base,orig	// distance from orig to the end of the word holding the \0
	sub tmp=8,val1		// bytes from the \0 (included) to the end of that word
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// length = distance - bytes from the \0 onwards
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.many rp	// end of successful recovery code
END(strlen)
EXPORT_SYMBOL(strlen)
License cleanup: add SPDX GPL-2.0 license identifier to files with no license Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a /uapi/ one with no licensing information in it, - file was a /uapi/ one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. 
- Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non /uapi/ files that summary was: SPDX license identifier # files ---------------------------------------------------\|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a /uapi/ path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------\|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the /uapi/ ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). Results summary: SPDX license identifier # files ---------------------------------------------------\|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). 
- when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. 
Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2017-11-01 08:07:57 -06:00			`/* SPDX-License-Identifier: GPL-2.0 */`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 16:20:36 -06:00			`/*`
			`*`
			`* Optimized version of the standard strlen() function`
			`*`
			`*`
			`* Inputs:`
			`* in0 address of string`
			`*`
			`* Outputs:`
			`* ret0 the number of characters in the string (0 if empty string)`
			`* does not count the \0`
			`*`
			`* Copyright (C) 1999, 2001 Hewlett-Packard Co`
			`* Stephane Eranian <eranian@hpl.hp.com>`
			`*`
			`* 09/24/99 S.Eranian add speculation recovery code`
			`*/`

			`#include <asm/asmmacro.h>`
ia64: move exports to definitions Here we have another kind of deviation from the default case - a difference between exporting functions and non-functions. EXPORT_DATA_SYMBOL... is really different from EXPORT_SYMBOL... on ia64, and we need to use the right one when moving exports from .c where C compiler has the required information to .S, where we need to supply it manually. parisc64 will be another one like that. Tested-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> 2016-01-16 23:13:41 -07:00			`#include <asm/export.h>`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 16:20:36 -06:00
			`//`
			`//`
			`// This is an enhanced version of the basic strlen. it includes a combination`
			`// of compute zero index (czx), parallel comparisons, speculative loads and`
			`// loop unroll using rotating registers.`
			`//`
			`// General Ideas about the algorithm:`
			`// The goal is to look at the string in chunks of 8 bytes.`
			`// so we need to do a few extra checks at the beginning because the`
			`// string may not be 8-byte aligned. In this case we load the 8byte`
			`// quantity which includes the start of the string and mask the unused`
			`// bytes with 0xff to avoid confusing czx.`
			`// We use speculative loads and software pipelining to hide memory`
			`// latency and do read ahead safely. This way we defer any exception.`
			`//`
			`// Because we don't want the kernel to be relying on particular`
			`// settings of the DCR register, we provide recovery code in case`
			`// speculation fails. The recovery code is going to "redo" the work using`
			`// only normal loads. If we still get a fault then we generate a`
			`// kernel panic. Otherwise we return the strlen as usual.`
			`//`
			`// The fact that speculation may fail can be caused, for instance, by`
			`// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,`
			`// a NaT bit will be set if the translation is not present. The normal`
			`// load, on the other hand, will cause the translation to be inserted`
			`// if the mapping exists.`
			`//`
			`// It should be noted that we execute recovery code only when we need`
			`// to use the data that has been speculatively loaded: we don't execute`
			`// recovery code on pure read ahead data.`
			`//`
			`// Remarks:`
			`// - the cmp r0,r0 is used as a fast way to initialize a predicate`
			`// register to 1. This is required to make sure that we get the parallel`
			`// compare correct.`
			`//`
			`// - we don't use the epilogue counter to exit the loop but we need to set`
			`// it to zero beforehand.`
			`//`
			`// - after the loop we must test for Nat values because neither the`
			`// czx nor cmp instruction raise a NaT consumption fault. We must be`
			`// careful not to look too far for a Nat for which we don't care.`
			`// For instance we don't need to look at a NaT in val2 if the zero byte`
			`// was in val1.`
			`//`
			`// - Clearly performance tuning is required.`
			`//`
			`//`
			`//`
			`#define saved_pfs r11`
			`#define tmp r10`
			`#define base r16`
			`#define orig r17`
			`#define saved_pr r18`
			`#define src r19`
			`#define mask r20`
			`#define val r21`
			`#define val1 r22`
			`#define val2 r23`

			`GLOBAL_ENTRY(strlen)`
			`.prologue`
			`.save ar.pfs, saved_pfs`
			`alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8`

			`.rotr v[2], w[2] // declares our 4 aliases`

			`extr.u tmp=in0,0,3 // tmp=least significant 3 bits`
			`mov orig=in0 // keep trackof initial byte address`
			`dep src=0,in0,0,3 // src=8byte-aligned in0 address`
			`.save pr, saved_pr`
			`mov saved_pr=pr // preserve predicates (rotation)`
			`;;`

			`.body`

			`ld8 v[1]=[src],8 // must not speculate: can fail here`
			`shl tmp=tmp,3 // multiply by 8bits/byte`
			`mov mask=-1 // our mask`
			`;;`
			`ld8.s w[1]=[src],8 // speculatively load next`
			`cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and`
			`sub tmp=64,tmp // how many bits to shift our mask on the right`
			`;;`
			`shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part`
			`mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)`
			`;;`
			`add base=-16,src // keep track of aligned base`
			`or v[1]=v[1],mask // now we have a safe initial byte pattern`
			`;;`
			`1:`
			`ld8.s v[0]=[src],8 // speculatively load next`
			`czx1.r val1=v[1] // search 0 byte from right`
			`czx1.r val2=w[1] // search 0 byte from right following 8bytes`
			`;;`
			`ld8.s w[0]=[src],8 // speculatively load next to next`
			`cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8`
			`cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8`
			`(p6) br.wtop.dptk 1b // loop until p6 == 0`
			`;;`
			`//`
			`// We must return try the recovery code iff`
			`// val1_is_nat \|\| (val1==8 && val2_is_nat)`
			`//`
			`// XXX Fixme`
			`// - there must be a better way of doing the test`
			`//`
			`cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate)`
			`tnat.nz p6,p7=val1 // test NaT on val1`
			`(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT`
			`;;`
			`//`
			`// if we come here p7 is true, i.e., initialized for // cmp`
			`//`
			`cmp.eq.and p7,p0=8,val1// val1==8?`
			`tnat.nz.and p7,p0=val2 // test NaT if val2`
			`(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT`
			`;;`
			`(p8) mov val1=val2 // the other test got us out of the loop`
			`(p8) adds src=-16,src // correct position when 3 ahead`
			`(p9) adds src=-24,src // correct position when 4 ahead`
			`;;`
			`sub ret0=src,orig // distance from base`
			`sub tmp=8,val1 // which byte in word`
			`mov pr=saved_pr,0xffffffffffff0000`
			`;;`
			`sub ret0=ret0,tmp // adjust`
			`mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what`
			`br.ret.sptk.many rp // end of normal execution`

			`//`
			`// Outlined recovery code when speculation failed`
			`//`
			`// This time we don't use speculation and rely on the normal exception`
			`// mechanism. that's why the loop is not as good as the previous one`
			`// because read ahead is not possible`
			`//`
			`// IMPORTANT:`
			`// Please note that in the case of strlen() as opposed to strlen_user()`
			`// we don't use the exception mechanism, as this function is not`
			`// supposed to fail. If that happens it means we have a bug and the`
			`// code will cause of kernel fault.`
			`//`
			`// XXX Fixme`
			`// - today we restart from the beginning of the string instead`
			`// of trying to continue where we left off.`
			`//`
			`.recover:`
			`ld8 val=[base],8 // will fail if unrecoverable fault`
			`;;`
			`or val=val,mask // remask first bytes`
			`cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop`
			`;;`
			`//`
			`// ar.ec is still zero here`
			`//`
			`2:`
			`(p6) ld8 val=[base],8 // will fail if unrecoverable fault`
			`;;`
			`czx1.r val1=val // search 0 byte from right`
			`;;`
			`cmp.eq p6,p0=8,val1 // val1==8 ?`
			`(p6) br.wtop.dptk 2b // loop until p6 == 0`
			`;; // (avoid WAW on p63)`
			`sub ret0=base,orig // distance from base`
			`sub tmp=8,val1`
			`mov pr=saved_pr,0xffffffffffff0000`
			`;;`
			`sub ret0=ret0,tmp // length=now - back -1`
			`mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what`
			`br.ret.sptk.many rp // end of successful recovery code`
			`END(strlen)`
ia64: move exports to definitions Here we have another kind of deviation from the default case - a difference between exporting functions and non-functions. EXPORT_DATA_SYMBOL... is really different from EXPORT_SYMBOL... on ia64, and we need to use the right one when moving exports from .c where C compiler has the required information to .S, where we need to supply it manually. parisc64 will be another one like that. Tested-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> 2016-01-16 23:13:41 -07:00			`EXPORT_SYMBOL(strlen)`