From 2c906b317b2d9c7e32b0d513e102bd68a2c49112 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Wed, 22 Mar 2006 09:54:10 +0000 Subject: [PATCH 001/306] [PATCH] cpufreq_conservative: aligning of codebase with ondemand Since the conservative govenor was released its codebase has drifted from the the direction and updates that have been applied to the ondemand govornor. This patch addresses the lack of updates in that period and brings conservative back up to date. The resulting diff file between cpufreq_ondemand.c and cpufreq_conservative.c is now much smaller and shows more clearly the differences between the two. Another reason to do this is ages ago, knowingly, I did a piss poor attempt at making conservative less responsive by knocking up DEF_SAMPLING_RATE_LATENCY_MULTIPLIER by two orders of magnitude. I did fix this ages ago but in my dis-organisation I must have toasted the diff and left it the way it was. About two weeks ago a user contacted me saying he was having problems with the conservative governor with his AMD Athlon XP-M 2800+ as /sys/devices/system/cpu/cpu0/cpufreq/conservative showed sampling_rate_min 9950000 sampling_rate_max 1360065408 Nine seconds to decide about changing the frequency....not too responsive :) Signed-off-by: Alexander Clouter Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_conservative.c | 53 +++++++++++++------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index ac38766b2583..adecd31f6156 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -35,12 +35,7 @@ */ #define DEF_FREQUENCY_UP_THRESHOLD (80) -#define MIN_FREQUENCY_UP_THRESHOLD (0) -#define MAX_FREQUENCY_UP_THRESHOLD (100) - #define DEF_FREQUENCY_DOWN_THRESHOLD (20) -#define MIN_FREQUENCY_DOWN_THRESHOLD (0) -#define MAX_FREQUENCY_DOWN_THRESHOLD (100) /* * The polling frequency of this governor depends on the capability of @@ -53,10 +48,14 @@ * All times here are in uS. */ static unsigned int def_sampling_rate; -#define MIN_SAMPLING_RATE (def_sampling_rate / 2) +#define MIN_SAMPLING_RATE_RATIO (2) +/* for correct statistics, we need at least 10 ticks between each measure */ +#define MIN_STAT_SAMPLING_RATE (MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10)) +#define MIN_SAMPLING_RATE (def_sampling_rate / MIN_SAMPLING_RATE_RATIO) #define MAX_SAMPLING_RATE (500 * def_sampling_rate) -#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER (100000) -#define DEF_SAMPLING_DOWN_FACTOR (5) +#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER (1000) +#define DEF_SAMPLING_DOWN_FACTOR (1) +#define MAX_SAMPLING_DOWN_FACTOR (10) #define TRANSITION_LATENCY_LIMIT (10 * 1000) static void do_dbs_timer(void *data); @@ -136,7 +135,7 @@ static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused, unsigned int input; int ret; ret = sscanf (buf, "%u", &input); - if (ret != 1 ) + if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; mutex_lock(&dbs_mutex); @@ -173,8 +172,7 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused, ret = sscanf (buf, "%u", &input); mutex_lock(&dbs_mutex); - if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD || - input < MIN_FREQUENCY_UP_THRESHOLD || + if (ret != 1 || input > 100 || input < 0 || input <= dbs_tuners_ins.down_threshold) { mutex_unlock(&dbs_mutex); return -EINVAL; @@ -194,8 +192,7 @@ static ssize_t store_down_threshold(struct cpufreq_policy *unused, ret = sscanf (buf, "%u", &input); mutex_lock(&dbs_mutex); - if (ret != 1 || input > MAX_FREQUENCY_DOWN_THRESHOLD || - input < MIN_FREQUENCY_DOWN_THRESHOLD || + if (ret != 1 || input > 100 || input < 0 || input >= dbs_tuners_ins.up_threshold) { mutex_unlock(&dbs_mutex); return -EINVAL; @@ -337,7 +334,6 @@ static void dbs_check_cpu(int cpu) */ /* Check for frequency increase */ - idle_ticks = UINT_MAX; for_each_cpu_mask(j, policy->cpus) { unsigned int tmp_idle_ticks, total_idle_ticks; @@ -357,7 +353,7 @@ static void dbs_check_cpu(int cpu) /* Scale idle ticks by 100 and compare with up and down ticks */ idle_ticks *= 100; up_idle_ticks = (100 - dbs_tuners_ins.up_threshold) * - usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + usecs_to_jiffies(dbs_tuners_ins.sampling_rate); if (idle_ticks < up_idle_ticks) { down_skip[cpu] = 0; @@ -398,6 +394,7 @@ static void dbs_check_cpu(int cpu) struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); + /* Check for frequency decrease */ total_idle_ticks = j_dbs_info->prev_cpu_idle_up; tmp_idle_ticks = total_idle_ticks - j_dbs_info->prev_cpu_idle_down; @@ -414,12 +411,14 @@ static void dbs_check_cpu(int cpu) freq_down_sampling_rate = dbs_tuners_ins.sampling_rate * dbs_tuners_ins.sampling_down_factor; down_idle_ticks = (100 - dbs_tuners_ins.down_threshold) * - usecs_to_jiffies(freq_down_sampling_rate); + usecs_to_jiffies(freq_down_sampling_rate); if (idle_ticks > down_idle_ticks) { - /* if we are already at the lowest speed then break out early + /* + * if we are already at the lowest speed then break out early * or if we 'cannot' reduce the speed as the user might want - * freq_step to be zero */ + * freq_step to be zero + */ if (requested_freq[cpu] == policy->min || dbs_tuners_ins.freq_step == 0) return; @@ -434,9 +433,8 @@ static void dbs_check_cpu(int cpu) if (requested_freq[cpu] < policy->min) requested_freq[cpu] = policy->min; - __cpufreq_driver_target(policy, - requested_freq[cpu], - CPUFREQ_RELATION_H); + __cpufreq_driver_target(policy, requested_freq[cpu], + CPUFREQ_RELATION_H); return; } } @@ -507,13 +505,16 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, if (dbs_enable == 1) { unsigned int latency; /* policy latency is in nS. Convert it to uS first */ + latency = policy->cpuinfo.transition_latency / 1000; + if (latency == 0) + latency = 1; - latency = policy->cpuinfo.transition_latency; - if (latency < 1000) - latency = 1000; - - def_sampling_rate = (latency / 1000) * + def_sampling_rate = latency * DEF_SAMPLING_RATE_LATENCY_MULTIPLIER; + + if (def_sampling_rate < MIN_STAT_SAMPLING_RATE) + def_sampling_rate = MIN_STAT_SAMPLING_RATE; + dbs_tuners_ins.sampling_rate = def_sampling_rate; dbs_tuners_ins.ignore_nice = 0; dbs_tuners_ins.freq_step = 5; From e8a02572252f9115c2b8296c40fd8b985f06f872 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Wed, 22 Mar 2006 09:56:23 +0000 Subject: [PATCH 002/306] [PATCH] cpufreq_conservative: alter default responsiveness The sensible approach to making conservative less responsive than ondemand :) As mentioned in patch [1/4]. We do not want conservative to shoot through all the frequencies, its point (by default) is to slowly move through them. By default its ten times less responsive. Signed-off-by: Alexander Clouter Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_conservative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index adecd31f6156..3ca3cf061642 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -509,7 +509,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, if (latency == 0) latency = 1; - def_sampling_rate = latency * + def_sampling_rate = 10 * latency * DEF_SAMPLING_RATE_LATENCY_MULTIPLIER; if (def_sampling_rate < MIN_STAT_SAMPLING_RATE) From 08a28e2e98aa821cf6f15f8a267beb2f33377bb9 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Wed, 22 Mar 2006 09:59:16 +0000 Subject: [PATCH 003/306] [PATCH] cpufreq_conservative: make for_each_cpu() safe All these changes should make cpufreq_conservative safe in regards to the x86 for_each_cpu cpumask.h changes and whatnot. Whilst making it safe a number of pointless for loops related to the cpu mask's were removed. I was never comfortable with all those for loops, especially as the iteration is over the same data again and again for each CPU you had in a single poll, an O(n^2) outcome to frequency scaling. The approach I use is to assume by default no CPU's exist and it sets the requested_freq to zero as a kind of flag, the reasoning is in the source ;) If the CPU is queried and requested_freq is zero then it initialises the variable to current_freq and then continues as if nothing happened which should be the same net effect as before? Signed-off-by: Alexander Clouter Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_conservative.c | 89 ++++++++++++-------------- 1 file changed, 41 insertions(+), 48 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 3ca3cf061642..7498f2506ade 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -294,31 +294,40 @@ static struct attribute_group dbs_attr_group = { static void dbs_check_cpu(int cpu) { unsigned int idle_ticks, up_idle_ticks, down_idle_ticks; + unsigned int tmp_idle_ticks, total_idle_ticks; unsigned int freq_step; unsigned int freq_down_sampling_rate; - static int down_skip[NR_CPUS]; - static int requested_freq[NR_CPUS]; - static unsigned short init_flag = 0; - struct cpu_dbs_info_s *this_dbs_info; - struct cpu_dbs_info_s *dbs_info; - + static unsigned short down_skip[NR_CPUS]; + static unsigned int requested_freq[NR_CPUS]; + static unsigned int init_flag = NR_CPUS; + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, cpu); struct cpufreq_policy *policy; - unsigned int j; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); if (!this_dbs_info->enable) return; - policy = this_dbs_info->cur_policy; - - if ( init_flag == 0 ) { - for_each_online_cpu(j) { - dbs_info = &per_cpu(cpu_dbs_info, j); - requested_freq[j] = dbs_info->cur_policy->cur; + if ( init_flag != 0 ) { + for_each_cpu(init_flag) { + down_skip[init_flag] = 0; + /* I doubt a CPU exists with a freq of 0hz :) */ + requested_freq[init_flag] = 0; } - init_flag = 1; + init_flag = 0; } + /* + * If its a freshly initialised cpu we setup requested_freq. This + * check could be avoided if we did not care about a first time + * stunted increase in CPU speed when there is a load. I feel we + * should be initialising this to something. The removal of a CPU + * is not a problem, after a short time the CPU should settle down + * to a 'natural' frequency. + */ + if (requested_freq[cpu] == 0) + requested_freq[cpu] = this_dbs_info->cur_policy->cur; + + policy = this_dbs_info->cur_policy; + /* * The default safe range is 20% to 80% * Every sampling_rate, we check @@ -335,20 +344,15 @@ static void dbs_check_cpu(int cpu) /* Check for frequency increase */ idle_ticks = UINT_MAX; - for_each_cpu_mask(j, policy->cpus) { - unsigned int tmp_idle_ticks, total_idle_ticks; - struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); - /* Check for frequency increase */ - total_idle_ticks = get_cpu_idle_time(j); - tmp_idle_ticks = total_idle_ticks - - j_dbs_info->prev_cpu_idle_up; - j_dbs_info->prev_cpu_idle_up = total_idle_ticks; + /* Check for frequency increase */ + total_idle_ticks = get_cpu_idle_time(cpu); + tmp_idle_ticks = total_idle_ticks - + this_dbs_info->prev_cpu_idle_up; + this_dbs_info->prev_cpu_idle_up = total_idle_ticks; - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; - } + if (tmp_idle_ticks < idle_ticks) + idle_ticks = tmp_idle_ticks; /* Scale idle ticks by 100 and compare with up and down ticks */ idle_ticks *= 100; @@ -357,13 +361,9 @@ static void dbs_check_cpu(int cpu) if (idle_ticks < up_idle_ticks) { down_skip[cpu] = 0; - for_each_cpu_mask(j, policy->cpus) { - struct cpu_dbs_info_s *j_dbs_info; + this_dbs_info->prev_cpu_idle_down = + this_dbs_info->prev_cpu_idle_up; - j_dbs_info = &per_cpu(cpu_dbs_info, j); - j_dbs_info->prev_cpu_idle_down = - j_dbs_info->prev_cpu_idle_up; - } /* if we are already at full speed then break out early */ if (requested_freq[cpu] == policy->max) return; @@ -388,21 +388,14 @@ static void dbs_check_cpu(int cpu) if (down_skip[cpu] < dbs_tuners_ins.sampling_down_factor) return; - idle_ticks = UINT_MAX; - for_each_cpu_mask(j, policy->cpus) { - unsigned int tmp_idle_ticks, total_idle_ticks; - struct cpu_dbs_info_s *j_dbs_info; + /* Check for frequency decrease */ + total_idle_ticks = this_dbs_info->prev_cpu_idle_up; + tmp_idle_ticks = total_idle_ticks - + this_dbs_info->prev_cpu_idle_down; + this_dbs_info->prev_cpu_idle_down = total_idle_ticks; - j_dbs_info = &per_cpu(cpu_dbs_info, j); - /* Check for frequency decrease */ - total_idle_ticks = j_dbs_info->prev_cpu_idle_up; - tmp_idle_ticks = total_idle_ticks - - j_dbs_info->prev_cpu_idle_down; - j_dbs_info->prev_cpu_idle_down = total_idle_ticks; - - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; - } + if (tmp_idle_ticks < idle_ticks) + idle_ticks = tmp_idle_ticks; /* Scale idle ticks by 100 and compare with up and down ticks */ idle_ticks *= 100; @@ -491,7 +484,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; - j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j); + j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(cpu); j_dbs_info->prev_cpu_idle_down = j_dbs_info->prev_cpu_idle_up; } From a159b82770ab84e1b5e0306fa65e158188492b16 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Wed, 22 Mar 2006 10:00:18 +0000 Subject: [PATCH 004/306] [PATCH] cpufreq_conservative: alternative initialise approach Venki, author of cpufreq_ondemand, came up with a neater way to remove the initialiser code from the main loop of my code and out to the point when the governor is actually initialised. Not only does it look but it also feels cleaner, plus its simpler to understand. It also saves a bunch of pointless conditional statements in the main loop. Signed-off-by: Alexander Clouter Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_conservative.c | 55 +++++++++----------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7498f2506ade..a152d2c46be7 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,6 +65,8 @@ struct cpu_dbs_info_s { unsigned int prev_cpu_idle_up; unsigned int prev_cpu_idle_down; unsigned int enable; + unsigned int down_skip; + unsigned int requested_freq; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -297,35 +299,12 @@ static void dbs_check_cpu(int cpu) unsigned int tmp_idle_ticks, total_idle_ticks; unsigned int freq_step; unsigned int freq_down_sampling_rate; - static unsigned short down_skip[NR_CPUS]; - static unsigned int requested_freq[NR_CPUS]; - static unsigned int init_flag = NR_CPUS; struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, cpu); struct cpufreq_policy *policy; if (!this_dbs_info->enable) return; - if ( init_flag != 0 ) { - for_each_cpu(init_flag) { - down_skip[init_flag] = 0; - /* I doubt a CPU exists with a freq of 0hz :) */ - requested_freq[init_flag] = 0; - } - init_flag = 0; - } - - /* - * If its a freshly initialised cpu we setup requested_freq. This - * check could be avoided if we did not care about a first time - * stunted increase in CPU speed when there is a load. I feel we - * should be initialising this to something. The removal of a CPU - * is not a problem, after a short time the CPU should settle down - * to a 'natural' frequency. - */ - if (requested_freq[cpu] == 0) - requested_freq[cpu] = this_dbs_info->cur_policy->cur; - policy = this_dbs_info->cur_policy; /* @@ -360,12 +339,12 @@ static void dbs_check_cpu(int cpu) usecs_to_jiffies(dbs_tuners_ins.sampling_rate); if (idle_ticks < up_idle_ticks) { - down_skip[cpu] = 0; + this_dbs_info->down_skip = 0; this_dbs_info->prev_cpu_idle_down = this_dbs_info->prev_cpu_idle_up; /* if we are already at full speed then break out early */ - if (requested_freq[cpu] == policy->max) + if (this_dbs_info->requested_freq == policy->max) return; freq_step = (dbs_tuners_ins.freq_step * policy->max) / 100; @@ -374,18 +353,18 @@ static void dbs_check_cpu(int cpu) if (unlikely(freq_step == 0)) freq_step = 5; - requested_freq[cpu] += freq_step; - if (requested_freq[cpu] > policy->max) - requested_freq[cpu] = policy->max; + this_dbs_info->requested_freq += freq_step; + if (this_dbs_info->requested_freq > policy->max) + this_dbs_info->requested_freq = policy->max; - __cpufreq_driver_target(policy, requested_freq[cpu], + __cpufreq_driver_target(policy, this_dbs_info->requested_freq, CPUFREQ_RELATION_H); return; } /* Check for frequency decrease */ - down_skip[cpu]++; - if (down_skip[cpu] < dbs_tuners_ins.sampling_down_factor) + this_dbs_info->down_skip++; + if (this_dbs_info->down_skip < dbs_tuners_ins.sampling_down_factor) return; /* Check for frequency decrease */ @@ -399,7 +378,7 @@ static void dbs_check_cpu(int cpu) /* Scale idle ticks by 100 and compare with up and down ticks */ idle_ticks *= 100; - down_skip[cpu] = 0; + this_dbs_info->down_skip = 0; freq_down_sampling_rate = dbs_tuners_ins.sampling_rate * dbs_tuners_ins.sampling_down_factor; @@ -412,7 +391,7 @@ static void dbs_check_cpu(int cpu) * or if we 'cannot' reduce the speed as the user might want * freq_step to be zero */ - if (requested_freq[cpu] == policy->min + if (this_dbs_info->requested_freq == policy->min || dbs_tuners_ins.freq_step == 0) return; @@ -422,11 +401,11 @@ static void dbs_check_cpu(int cpu) if (unlikely(freq_step == 0)) freq_step = 5; - requested_freq[cpu] -= freq_step; - if (requested_freq[cpu] < policy->min) - requested_freq[cpu] = policy->min; + this_dbs_info->requested_freq -= freq_step; + if (this_dbs_info->requested_freq < policy->min) + this_dbs_info->requested_freq = policy->min; - __cpufreq_driver_target(policy, requested_freq[cpu], + __cpufreq_driver_target(policy, this_dbs_info->requested_freq, CPUFREQ_RELATION_H); return; } @@ -489,6 +468,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, = j_dbs_info->prev_cpu_idle_up; } this_dbs_info->enable = 1; + this_dbs_info->down_skip = 0; + this_dbs_info->requested_freq = policy->cur; sysfs_create_group(&policy->kobj, &dbs_attr_group); dbs_enable++; /* From ff8c288d7d1a368b663058cdee1ea0adcdef2fa2 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Fri, 10 Mar 2006 11:34:16 +0200 Subject: [PATCH 005/306] [PATCH] cpufreq_ondemand: Warn if it cannot run due to too long transition latency Display a warning if the ondemand governor can not be selected due to a transition latency of the cpufreq driver which is too long. Signed-off-by: Eric Piel Acked-by: Venkatesh Pallipadi Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_ondemand.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 69aa1db8336c..6430489db6f4 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -395,8 +395,11 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return -EINVAL; if (policy->cpuinfo.transition_latency > - (TRANSITION_LATENCY_LIMIT * 1000)) + (TRANSITION_LATENCY_LIMIT * 1000)) { + printk(KERN_WARNING "ondemand governor failed to load " + "due to too long transition latency\n"); return -EINVAL; + } if (this_dbs_info->enable) /* Already enabled */ break; From 9cbad61b41f0b6f0a4c600fe96d8292ffd592b50 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Fri, 10 Mar 2006 11:35:27 +0200 Subject: [PATCH 006/306] [PATCH] cpufreq_ondemand: keep ignore_nice_load value when it is reselected Keep the value of ignore_nice_load of the ondemand governor even after the governor has been deselected and selected back. This is the behavior of the other exported values of the ondemand governor and it's much more user-friendly. Signed-off-by: Eric Piel Acked-by: Venkatesh Pallipadi Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_ondemand.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 6430489db6f4..cd846f57147e 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -84,6 +84,7 @@ struct dbs_tuners { static struct dbs_tuners dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, + .ignore_nice = 0, }; static inline unsigned int get_cpu_idle_time(unsigned int cpu) @@ -434,8 +435,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, def_sampling_rate = MIN_STAT_SAMPLING_RATE; dbs_tuners_ins.sampling_rate = def_sampling_rate; - dbs_tuners_ins.ignore_nice = 0; - dbs_timer_init(); } From 7c9d8c0e84d395a01289ebd1597758939a875a86 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 26 Mar 2006 11:11:03 +0200 Subject: [PATCH 007/306] [PATCH] cpufreq_ondemand: add range check Assert that cpufreq_target is, at least, called with the minimum frequency allowed by this policy, not something lower. It triggered problems on ARM. Signed-off-by: Dominik Brodowski --- drivers/cpufreq/cpufreq_ondemand.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index cd846f57147e..956d121cb161 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -351,6 +351,9 @@ static void dbs_check_cpu(int cpu) freq_next = (freq_next * policy->cur) / (dbs_tuners_ins.up_threshold - 10); + if (freq_next < policy->min) + freq_next = policy->min; + if (freq_next <= ((policy->cur * 95) / 100)) __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L); } From 8faaea3faa5ed0b2a15afb7b4e57ce0cd8dbe4ef Mon Sep 17 00:00:00 2001 From: Freddy Spierenburg Date: Sun, 26 Mar 2006 21:22:51 +0100 Subject: [PATCH 008/306] [SERIAL] Small time UART configuration fix for AU1100 processor The AU1100 processor does not have an internal UART2. Only UART0, UART1 and UART3 exist. This patch removes the non existing UART2 and replaces it with a descriptive comment. Signed-off-by: Freddy Spierenburg Signed-off-by: Russell King --- drivers/serial/8250_au1x00.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/8250_au1x00.c b/drivers/serial/8250_au1x00.c index 8d8d7a70d03e..3d1bfd07208d 100644 --- a/drivers/serial/8250_au1x00.c +++ b/drivers/serial/8250_au1x00.c @@ -51,7 +51,7 @@ static struct plat_serial8250_port au1x00_data[] = { #elif defined(CONFIG_SOC_AU1100) PORT(UART0_ADDR, AU1100_UART0_INT), PORT(UART1_ADDR, AU1100_UART1_INT), - PORT(UART2_ADDR, AU1100_UART2_INT), + /* The internal UART2 does not exist on the AU1100 processor. */ PORT(UART3_ADDR, AU1100_UART3_INT), #elif defined(CONFIG_SOC_AU1550) PORT(UART0_ADDR, AU1550_UART0_INT), From 335bd9dff31d042b773591933d3ee5bd62d5ea27 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sun, 26 Mar 2006 21:25:57 +0100 Subject: [PATCH 009/306] [SERIAL] Remove obsoleted au1x00_uart driver As announced in feature-removal-schedule.txt. Signed-off-by: Ralf Baechle Signed-off-by: Russell King --- arch/mips/au1000/common/setup.c | 2 +- drivers/serial/Kconfig | 16 - drivers/serial/Makefile | 1 - drivers/serial/au1x00_uart.c | 1287 ------------------------------- include/asm-mips/serial.h | 85 +- 5 files changed, 2 insertions(+), 1389 deletions(-) delete mode 100644 drivers/serial/au1x00_uart.c diff --git a/arch/mips/au1000/common/setup.c b/arch/mips/au1000/common/setup.c index 1080558c8100..307e98c29ddc 100644 --- a/arch/mips/au1000/common/setup.c +++ b/arch/mips/au1000/common/setup.c @@ -94,7 +94,7 @@ void __init plat_setup(void) argptr = prom_getcmdline(); -#if defined(CONFIG_SERIAL_AU1X00_CONSOLE) || defined(CONFIG_SERIAL_8250_CONSOLE) +#ifdef CONFIG_SERIAL_8250_CONSOLE if ((argptr = strstr(argptr, "console=")) == NULL) { argptr = prom_getcmdline(); strcat(argptr, " console=ttyS0,115200"); diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index ceb3697bf84d..fe0d8b8e91c8 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -620,22 +620,6 @@ config SERIAL_SH_SCI_CONSOLE depends on SERIAL_SH_SCI=y select SERIAL_CORE_CONSOLE -config SERIAL_AU1X00 - bool "Enable Au1x00 UART Support" - depends on MIPS && SOC_AU1X00 - select SERIAL_CORE - help - If you have an Alchemy AU1X00 processor (MIPS based) and you want - to use serial ports, say Y. Otherwise, say N. - -config SERIAL_AU1X00_CONSOLE - bool "Enable Au1x00 serial console" - depends on SERIAL_AU1X00 - select SERIAL_CORE_CONSOLE - help - If you have an Alchemy AU1X00 processor (MIPS based) and you want - to use a console on a serial port, say Y. Otherwise, say N. - config SERIAL_CORE tristate diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index a3a4323d9c86..d2b4c214876b 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_SERIAL_COLDFIRE) += mcfserial.o obj-$(CONFIG_V850E_UART) += v850e_uart.o obj-$(CONFIG_SERIAL_PMACZILOG) += pmac_zilog.o obj-$(CONFIG_SERIAL_LH7A40X) += serial_lh7a40x.o -obj-$(CONFIG_SERIAL_AU1X00) += au1x00_uart.o obj-$(CONFIG_SERIAL_DZ) += dz.o obj-$(CONFIG_SERIAL_SH_SCI) += sh-sci.o obj-$(CONFIG_SERIAL_SGI_L1_CONSOLE) += sn_console.o diff --git a/drivers/serial/au1x00_uart.c b/drivers/serial/au1x00_uart.c deleted file mode 100644 index 948880ac5878..000000000000 --- a/drivers/serial/au1x00_uart.c +++ /dev/null @@ -1,1287 +0,0 @@ -/* - * Driver for 8250/16550-type serial ports - * - * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. - * - * Copyright (C) 2001 Russell King. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * A note about mapbase / membase - * - * mapbase is the physical address of the IO port. Currently, we don't - * support this very well, and it may well be dropped from this driver - * in future. As such, mapbase should be NULL. - * - * membase is an 'ioremapped' cookie. This is compatible with the old - * serial.c driver, and is currently the preferred form. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#if defined(CONFIG_SERIAL_AU1X00_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) -#define SUPPORT_SYSRQ -#endif - -#include -#include "8250.h" - -/* - * Debugging. - */ -#if 0 -#define DEBUG_AUTOCONF(fmt...) printk(fmt) -#else -#define DEBUG_AUTOCONF(fmt...) do { } while (0) -#endif - -#if 0 -#define DEBUG_INTR(fmt...) printk(fmt) -#else -#define DEBUG_INTR(fmt...) do { } while (0) -#endif - -#define PASS_LIMIT 256 - -/* - * We default to IRQ0 for the "no irq" hack. Some - * machine types want others as well - they're free - * to redefine this in their header file. - */ -#define is_real_interrupt(irq) ((irq) != 0) - -static struct old_serial_port old_serial_port[] = { - { .baud_base = 0, - .iomem_base = (u8 *)UART0_ADDR, - .irq = AU1000_UART0_INT, - .flags = STD_COM_FLAGS, - .iomem_reg_shift = 2, - }, { - .baud_base = 0, - .iomem_base = (u8 *)UART1_ADDR, - .irq = AU1000_UART1_INT, - .flags = STD_COM_FLAGS, - .iomem_reg_shift = 2 - }, { - .baud_base = 0, - .iomem_base = (u8 *)UART2_ADDR, - .irq = AU1000_UART2_INT, - .flags = STD_COM_FLAGS, - .iomem_reg_shift = 2 - }, { - .baud_base = 0, - .iomem_base = (u8 *)UART3_ADDR, - .irq = AU1000_UART3_INT, - .flags = STD_COM_FLAGS, - .iomem_reg_shift = 2 - } -}; - -#define UART_NR ARRAY_SIZE(old_serial_port) - -struct uart_8250_port { - struct uart_port port; - struct timer_list timer; /* "no irq" timer */ - struct list_head list; /* ports on this IRQ */ - unsigned short rev; - unsigned char acr; - unsigned char ier; - unsigned char lcr; - unsigned char mcr_mask; /* mask of user bits */ - unsigned char mcr_force; /* mask of forced bits */ - unsigned char lsr_break_flag; - - /* - * We provide a per-port pm hook. - */ - void (*pm)(struct uart_port *port, - unsigned int state, unsigned int old); -}; - -struct irq_info { - spinlock_t lock; - struct list_head *head; -}; - -static struct irq_info irq_lists[NR_IRQS]; - -/* - * Here we define the default xmit fifo size used for each type of UART. - */ -static const struct serial_uart_config uart_config[PORT_MAX_8250+1] = { - { "unknown", 1, 0 }, - { "8250", 1, 0 }, - { "16450", 1, 0 }, - { "16550", 1, 0 }, - /* PORT_16550A */ - { "AU1X00_UART",16, UART_CLEAR_FIFO | UART_USE_FIFO }, -}; - -static unsigned int serial_in(struct uart_8250_port *up, int offset) -{ - return au_readl((unsigned long)up->port.membase + offset); -} - -static void serial_out(struct uart_8250_port *up, int offset, int value) -{ - au_writel(value, (unsigned long)up->port.membase + offset); -} - -#define serial_inp(up, offset) serial_in(up, offset) -#define serial_outp(up, offset, value) serial_out(up, offset, value) - -/* - * This routine is called by rs_init() to initialize a specific serial - * port. It determines what type of UART chip this serial port is - * using: 8250, 16450, 16550, 16550A. The important question is - * whether or not this UART is a 16550A or not, since this will - * determine whether or not we can use its FIFO features or not. - */ -static void autoconfig(struct uart_8250_port *up, unsigned int probeflags) -{ - unsigned char save_lcr, save_mcr; - unsigned long flags; - - if (!up->port.iobase && !up->port.mapbase && !up->port.membase) - return; - - DEBUG_AUTOCONF("ttyS%d: autoconf (0x%04x, 0x%08lx): ", - up->port.line, up->port.iobase, up->port.membase); - - /* - * We really do need global IRQs disabled here - we're going to - * be frobbing the chips IRQ enable register to see if it exists. - */ - spin_lock_irqsave(&up->port.lock, flags); -// save_flags(flags); cli(); - - save_mcr = serial_in(up, UART_MCR); - save_lcr = serial_in(up, UART_LCR); - - up->port.type = PORT_16550A; - serial_outp(up, UART_LCR, save_lcr); - - up->port.fifosize = uart_config[up->port.type].dfl_xmit_fifo_size; - - if (up->port.type == PORT_UNKNOWN) - goto out; - - /* - * Reset the UART. - */ - serial_outp(up, UART_MCR, save_mcr); - serial_outp(up, UART_FCR, (UART_FCR_ENABLE_FIFO | - UART_FCR_CLEAR_RCVR | - UART_FCR_CLEAR_XMIT)); - serial_outp(up, UART_FCR, 0); - (void)serial_in(up, UART_RX); - serial_outp(up, UART_IER, 0); - - out: - spin_unlock_irqrestore(&up->port.lock, flags); -// restore_flags(flags); - DEBUG_AUTOCONF("type=%s\n", uart_config[up->port.type].name); -} - -static void serial8250_stop_tx(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - - if (up->ier & UART_IER_THRI) { - up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); - } -} - -static void serial8250_start_tx(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - - if (!(up->ier & UART_IER_THRI)) { - up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); - } -} - -static void serial8250_stop_rx(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - - up->ier &= ~UART_IER_RLSI; - up->port.read_status_mask &= ~UART_LSR_DR; - serial_out(up, UART_IER, up->ier); -} - -static void serial8250_enable_ms(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - - up->ier |= UART_IER_MSI; - serial_out(up, UART_IER, up->ier); -} - -static void -receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs) -{ - struct tty_struct *tty = up->port.info->tty; - unsigned char ch, flag; - int max_count = 256; - - do { - ch = serial_inp(up, UART_RX); - flag = TTY_NORMAL; - up->port.icount.rx++; - - if (unlikely(*status & (UART_LSR_BI | UART_LSR_PE | - UART_LSR_FE | UART_LSR_OE))) { - /* - * For statistics only - */ - if (*status & UART_LSR_BI) { - *status &= ~(UART_LSR_FE | UART_LSR_PE); - up->port.icount.brk++; - /* - * We do the SysRQ and SAK checking - * here because otherwise the break - * may get masked by ignore_status_mask - * or read_status_mask. - */ - if (uart_handle_break(&up->port)) - goto ignore_char; - } else if (*status & UART_LSR_PE) - up->port.icount.parity++; - else if (*status & UART_LSR_FE) - up->port.icount.frame++; - if (*status & UART_LSR_OE) - up->port.icount.overrun++; - - /* - * Mask off conditions which should be ingored. - */ - *status &= up->port.read_status_mask; - -#ifdef CONFIG_SERIAL_AU1X00_CONSOLE - if (up->port.line == up->port.cons->index) { - /* Recover the break flag from console xmit */ - *status |= up->lsr_break_flag; - up->lsr_break_flag = 0; - } -#endif - if (*status & UART_LSR_BI) { - DEBUG_INTR("handling break...."); - flag = TTY_BREAK; - } else if (*status & UART_LSR_PE) - flag = TTY_PARITY; - else if (*status & UART_LSR_FE) - flag = TTY_FRAME; - } - if (uart_handle_sysrq_char(&up->port, ch, regs)) - goto ignore_char; - if ((*status & up->port.ignore_status_mask) == 0) - tty_insert_flip_char(tty, ch, flag); - if (*status & UART_LSR_OE) - /* - * Overrun is special, since it's reported - * immediately, and doesn't affect the current - * character. - */ - tty_insert_flip_char(tty, 0, TTY_OVERRUN); - } - ignore_char: - *status = serial_inp(up, UART_LSR); - } while ((*status & UART_LSR_DR) && (max_count-- > 0)); - spin_unlock(&up->port.lock); - tty_flip_buffer_push(tty); - spin_lock(&up->port.lock); -} - -static void transmit_chars(struct uart_8250_port *up) -{ - struct circ_buf *xmit = &up->port.info->xmit; - int count; - - if (up->port.x_char) { - serial_outp(up, UART_TX, up->port.x_char); - up->port.icount.tx++; - up->port.x_char = 0; - return; - } - if (uart_circ_empty(xmit) || uart_tx_stopped(&up->port)) { - serial8250_stop_tx(&up->port); - return; - } - - count = up->port.fifosize; - do { - serial_out(up, UART_TX, xmit->buf[xmit->tail]); - xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); - up->port.icount.tx++; - if (uart_circ_empty(xmit)) - break; - } while (--count > 0); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(&up->port); - - DEBUG_INTR("THRE..."); - - if (uart_circ_empty(xmit)) - serial8250_stop_tx(&up->port); -} - -static void check_modem_status(struct uart_8250_port *up) -{ - int status; - - status = serial_in(up, UART_MSR); - - if ((status & UART_MSR_ANY_DELTA) == 0) - return; - - if (status & UART_MSR_TERI) - up->port.icount.rng++; - if (status & UART_MSR_DDSR) - up->port.icount.dsr++; - if (status & UART_MSR_DDCD) - uart_handle_dcd_change(&up->port, status & UART_MSR_DCD); - if (status & UART_MSR_DCTS) - uart_handle_cts_change(&up->port, status & UART_MSR_CTS); - - wake_up_interruptible(&up->port.info->delta_msr_wait); -} - -/* - * This handles the interrupt from one port. - */ -static inline void -serial8250_handle_port(struct uart_8250_port *up, struct pt_regs *regs) -{ - unsigned int status = serial_inp(up, UART_LSR); - - DEBUG_INTR("status = %x...", status); - - if (status & UART_LSR_DR) - receive_chars(up, &status, regs); - check_modem_status(up); - if (status & UART_LSR_THRE) - transmit_chars(up); -} - -/* - * This is the serial driver's interrupt routine. - * - * Arjan thinks the old way was overly complex, so it got simplified. - * Alan disagrees, saying that need the complexity to handle the weird - * nature of ISA shared interrupts. (This is a special exception.) - * - * In order to handle ISA shared interrupts properly, we need to check - * that all ports have been serviced, and therefore the ISA interrupt - * line has been de-asserted. - * - * This means we need to loop through all ports. checking that they - * don't have an interrupt pending. - */ -static irqreturn_t serial8250_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct irq_info *i = dev_id; - struct list_head *l, *end = NULL; - int pass_counter = 0; - - DEBUG_INTR("serial8250_interrupt(%d)...", irq); - - spin_lock(&i->lock); - - l = i->head; - do { - struct uart_8250_port *up; - unsigned int iir; - - up = list_entry(l, struct uart_8250_port, list); - - iir = serial_in(up, UART_IIR); - if (!(iir & UART_IIR_NO_INT)) { - spin_lock(&up->port.lock); - serial8250_handle_port(up, regs); - spin_unlock(&up->port.lock); - - end = NULL; - } else if (end == NULL) - end = l; - - l = l->next; - - if (l == i->head && pass_counter++ > PASS_LIMIT) { - /* If we hit this, we're dead. */ - printk(KERN_ERR "serial8250: too much work for " - "irq%d\n", irq); - break; - } - } while (l != end); - - spin_unlock(&i->lock); - - DEBUG_INTR("end.\n"); - /* FIXME! Was it really ours? */ - return IRQ_HANDLED; -} - -/* - * To support ISA shared interrupts, we need to have one interrupt - * handler that ensures that the IRQ line has been deasserted - * before returning. Failing to do this will result in the IRQ - * line being stuck active, and, since ISA irqs are edge triggered, - * no more IRQs will be seen. - */ -static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) -{ - spin_lock_irq(&i->lock); - - if (!list_empty(i->head)) { - if (i->head == &up->list) - i->head = i->head->next; - list_del(&up->list); - } else { - BUG_ON(i->head != &up->list); - i->head = NULL; - } - - spin_unlock_irq(&i->lock); -} - -static int serial_link_irq_chain(struct uart_8250_port *up) -{ - struct irq_info *i = irq_lists + up->port.irq; - int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? SA_SHIRQ : 0; - - spin_lock_irq(&i->lock); - - if (i->head) { - list_add(&up->list, i->head); - spin_unlock_irq(&i->lock); - - ret = 0; - } else { - INIT_LIST_HEAD(&up->list); - i->head = &up->list; - spin_unlock_irq(&i->lock); - - ret = request_irq(up->port.irq, serial8250_interrupt, - irq_flags, "serial", i); - if (ret < 0) - serial_do_unlink(i, up); - } - - return ret; -} - -static void serial_unlink_irq_chain(struct uart_8250_port *up) -{ - struct irq_info *i = irq_lists + up->port.irq; - - BUG_ON(i->head == NULL); - - if (list_empty(i->head)) - free_irq(up->port.irq, i); - - serial_do_unlink(i, up); -} - -/* - * This function is used to handle ports that do not have an - * interrupt. This doesn't work very well for 16450's, but gives - * barely passable results for a 16550A. (Although at the expense - * of much CPU overhead). - */ -static void serial8250_timeout(unsigned long data) -{ - struct uart_8250_port *up = (struct uart_8250_port *)data; - unsigned int timeout; - unsigned int iir; - - iir = serial_in(up, UART_IIR); - if (!(iir & UART_IIR_NO_INT)) { - spin_lock(&up->port.lock); - serial8250_handle_port(up, NULL); - spin_unlock(&up->port.lock); - } - - timeout = up->port.timeout; - timeout = timeout > 6 ? (timeout / 2 - 2) : 1; - mod_timer(&up->timer, jiffies + timeout); -} - -static unsigned int serial8250_tx_empty(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&up->port.lock, flags); - ret = serial_in(up, UART_LSR) & UART_LSR_TEMT ? TIOCSER_TEMT : 0; - spin_unlock_irqrestore(&up->port.lock, flags); - - return ret; -} - -static unsigned int serial8250_get_mctrl(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned char status; - unsigned int ret; - - status = serial_in(up, UART_MSR); - - ret = 0; - if (status & UART_MSR_DCD) - ret |= TIOCM_CAR; - if (status & UART_MSR_RI) - ret |= TIOCM_RNG; - if (status & UART_MSR_DSR) - ret |= TIOCM_DSR; - if (status & UART_MSR_CTS) - ret |= TIOCM_CTS; - return ret; -} - -static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned char mcr = 0; - - if (mctrl & TIOCM_RTS) - mcr |= UART_MCR_RTS; - if (mctrl & TIOCM_DTR) - mcr |= UART_MCR_DTR; - if (mctrl & TIOCM_OUT1) - mcr |= UART_MCR_OUT1; - if (mctrl & TIOCM_OUT2) - mcr |= UART_MCR_OUT2; - if (mctrl & TIOCM_LOOP) - mcr |= UART_MCR_LOOP; - - mcr = (mcr & up->mcr_mask) | up->mcr_force; - - serial_out(up, UART_MCR, mcr); -} - -static void serial8250_break_ctl(struct uart_port *port, int break_state) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); - if (break_state == -1) - up->lcr |= UART_LCR_SBC; - else - up->lcr &= ~UART_LCR_SBC; - serial_out(up, UART_LCR, up->lcr); - spin_unlock_irqrestore(&up->port.lock, flags); -} - -static int serial8250_startup(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned long flags; - int retval; - - /* - * Clear the FIFO buffers and disable them. - * (they will be reeanbled in set_termios()) - */ - if (uart_config[up->port.type].flags & UART_CLEAR_FIFO) { - serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO); - serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO | - UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); - serial_outp(up, UART_FCR, 0); - } - - /* - * Clear the interrupt registers. - */ - (void) serial_inp(up, UART_LSR); - (void) serial_inp(up, UART_RX); - (void) serial_inp(up, UART_IIR); - (void) serial_inp(up, UART_MSR); - - /* - * At this point, there's no way the LSR could still be 0xff; - * if it is, then bail out, because there's likely no UART - * here. - */ - if (!(up->port.flags & UPF_BUGGY_UART) && - (serial_inp(up, UART_LSR) == 0xff)) { - printk("ttyS%d: LSR safety check engaged!\n", up->port.line); - return -ENODEV; - } - - retval = serial_link_irq_chain(up); - if (retval) - return retval; - - /* - * Now, initialize the UART - */ - serial_outp(up, UART_LCR, UART_LCR_WLEN8); - - spin_lock_irqsave(&up->port.lock, flags); - if (up->port.flags & UPF_FOURPORT) { - if (!is_real_interrupt(up->port.irq)) - up->port.mctrl |= TIOCM_OUT1; - } else - /* - * Most PC uarts need OUT2 raised to enable interrupts. - */ - if (is_real_interrupt(up->port.irq)) - up->port.mctrl |= TIOCM_OUT2; - - serial8250_set_mctrl(&up->port, up->port.mctrl); - spin_unlock_irqrestore(&up->port.lock, flags); - - /* - * Finally, enable interrupts. Note: Modem status interrupts - * are set via set_termios(), which will be occurring imminently - * anyway, so we don't enable them here. - */ - up->ier = UART_IER_RLSI | UART_IER_RDI; - serial_outp(up, UART_IER, up->ier); - - if (up->port.flags & UPF_FOURPORT) { - unsigned int icp; - /* - * Enable interrupts on the AST Fourport board - */ - icp = (up->port.iobase & 0xfe0) | 0x01f; - outb_p(0x80, icp); - (void) inb_p(icp); - } - - /* - * And clear the interrupt registers again for luck. - */ - (void) serial_inp(up, UART_LSR); - (void) serial_inp(up, UART_RX); - (void) serial_inp(up, UART_IIR); - (void) serial_inp(up, UART_MSR); - - return 0; -} - -static void serial8250_shutdown(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned long flags; - - /* - * Disable interrupts from this port - */ - up->ier = 0; - serial_outp(up, UART_IER, 0); - - spin_lock_irqsave(&up->port.lock, flags); - if (up->port.flags & UPF_FOURPORT) { - /* reset interrupts on the AST Fourport board */ - inb((up->port.iobase & 0xfe0) | 0x1f); - up->port.mctrl |= TIOCM_OUT1; - } else - up->port.mctrl &= ~TIOCM_OUT2; - - serial8250_set_mctrl(&up->port, up->port.mctrl); - spin_unlock_irqrestore(&up->port.lock, flags); - - /* - * Disable break condition and FIFOs - */ - serial_out(up, UART_LCR, serial_inp(up, UART_LCR) & ~UART_LCR_SBC); - serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO | - UART_FCR_CLEAR_RCVR | - UART_FCR_CLEAR_XMIT); - serial_outp(up, UART_FCR, 0); - - /* - * Read data port to reset things, and then unlink from - * the IRQ chain. - */ - (void) serial_in(up, UART_RX); - - if (!is_real_interrupt(up->port.irq)) - del_timer_sync(&up->timer); - else - serial_unlink_irq_chain(up); -} - -static unsigned int serial8250_get_divisor(struct uart_port *port, unsigned int baud) -{ - unsigned int quot; - - /* - * Handle magic divisors for baud rates above baud_base on - * SMSC SuperIO chips. - */ - if ((port->flags & UPF_MAGIC_MULTIPLIER) && - baud == (port->uartclk/4)) - quot = 0x8001; - else if ((port->flags & UPF_MAGIC_MULTIPLIER) && - baud == (port->uartclk/8)) - quot = 0x8002; - else - quot = uart_get_divisor(port, baud); - - return quot; -} - -static void -serial8250_set_termios(struct uart_port *port, struct termios *termios, - struct termios *old) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned char cval, fcr = 0; - unsigned long flags; - unsigned int baud, quot; - - switch (termios->c_cflag & CSIZE) { - case CS5: - cval = UART_LCR_WLEN5; - break; - case CS6: - cval = UART_LCR_WLEN6; - break; - case CS7: - cval = UART_LCR_WLEN7; - break; - default: - case CS8: - cval = UART_LCR_WLEN8; - break; - } - - if (termios->c_cflag & CSTOPB) - cval |= UART_LCR_STOP; - if (termios->c_cflag & PARENB) - cval |= UART_LCR_PARITY; - if (!(termios->c_cflag & PARODD)) - cval |= UART_LCR_EPAR; -#ifdef CMSPAR - if (termios->c_cflag & CMSPAR) - cval |= UART_LCR_SPAR; -#endif - - /* - * Ask the core to calculate the divisor for us. - */ - baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); - quot = serial8250_get_divisor(port, baud); - quot = 0x35; /* FIXME */ - - /* - * Work around a bug in the Oxford Semiconductor 952 rev B - * chip which causes it to seriously miscalculate baud rates - * when DLL is 0. - */ - if ((quot & 0xff) == 0 && up->port.type == PORT_16C950 && - up->rev == 0x5201) - quot ++; - - if (uart_config[up->port.type].flags & UART_USE_FIFO) { - if (baud < 2400) - fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIGGER_1; - else - fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIGGER_8; - } - - /* - * Ok, we're now changing the port state. Do it with - * interrupts disabled. - */ - spin_lock_irqsave(&up->port.lock, flags); - - /* - * Update the per-port timeout. - */ - uart_update_timeout(port, termios->c_cflag, baud); - - up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; - if (termios->c_iflag & INPCK) - up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE; - if (termios->c_iflag & (BRKINT | PARMRK)) - up->port.read_status_mask |= UART_LSR_BI; - - /* - * Characteres to ignore - */ - up->port.ignore_status_mask = 0; - if (termios->c_iflag & IGNPAR) - up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; - if (termios->c_iflag & IGNBRK) { - up->port.ignore_status_mask |= UART_LSR_BI; - /* - * If we're ignoring parity and break indicators, - * ignore overruns too (for real raw support). - */ - if (termios->c_iflag & IGNPAR) - up->port.ignore_status_mask |= UART_LSR_OE; - } - - /* - * ignore all characters if CREAD is not set - */ - if ((termios->c_cflag & CREAD) == 0) - up->port.ignore_status_mask |= UART_LSR_DR; - - /* - * CTS flow control flag and modem status interrupts - */ - up->ier &= ~UART_IER_MSI; - if (UART_ENABLE_MS(&up->port, termios->c_cflag)) - up->ier |= UART_IER_MSI; - - serial_out(up, UART_IER, up->ier); - serial_outp(up, 0x28, quot & 0xffff); - up->lcr = cval; /* Save LCR */ - if (up->port.type != PORT_16750) { - if (fcr & UART_FCR_ENABLE_FIFO) { - /* emulated UARTs (Lucent Venus 167x) need two steps */ - serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO); - } - serial_outp(up, UART_FCR, fcr); /* set fcr */ - } - spin_unlock_irqrestore(&up->port.lock, flags); -} - -static void -serial8250_pm(struct uart_port *port, unsigned int state, - unsigned int oldstate) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - if (state) { - /* sleep */ - if (up->pm) - up->pm(port, state, oldstate); - } else { - /* wake */ - if (up->pm) - up->pm(port, state, oldstate); - } -} - -/* - * Resource handling. This is complicated by the fact that resources - * depend on the port type. Maybe we should be claiming the standard - * 8250 ports, and then trying to get other resources as necessary? - */ -static int -serial8250_request_std_resource(struct uart_8250_port *up, struct resource **res) -{ - unsigned int size = 8 << up->port.regshift; - int ret = 0; - - switch (up->port.iotype) { - case UPIO_MEM: - if (up->port.mapbase) { - *res = request_mem_region(up->port.mapbase, size, "serial"); - if (!*res) - ret = -EBUSY; - } - break; - - case UPIO_HUB6: - case UPIO_PORT: - *res = request_region(up->port.iobase, size, "serial"); - if (!*res) - ret = -EBUSY; - break; - } - return ret; -} - - -static void serial8250_release_port(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - unsigned long start, offset = 0, size = 0; - - size <<= up->port.regshift; - - switch (up->port.iotype) { - case UPIO_MEM: - if (up->port.mapbase) { - /* - * Unmap the area. - */ - iounmap(up->port.membase); - up->port.membase = NULL; - - start = up->port.mapbase; - - if (size) - release_mem_region(start + offset, size); - release_mem_region(start, 8 << up->port.regshift); - } - break; - - case UPIO_HUB6: - case UPIO_PORT: - start = up->port.iobase; - - if (size) - release_region(start + offset, size); - release_region(start + offset, 8 << up->port.regshift); - break; - - default: - break; - } -} - -static int serial8250_request_port(struct uart_port *port) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - struct resource *res = NULL, *res_rsa = NULL; - int ret = 0; - - ret = serial8250_request_std_resource(up, &res); - - /* - * If we have a mapbase, then request that as well. - */ - if (ret == 0 && up->port.flags & UPF_IOREMAP) { - int size = res->end - res->start + 1; - - up->port.membase = ioremap(up->port.mapbase, size); - if (!up->port.membase) - ret = -ENOMEM; - } - - if (ret < 0) { - if (res_rsa) - release_resource(res_rsa); - if (res) - release_resource(res); - } - return ret; -} - -static void serial8250_config_port(struct uart_port *port, int flags) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - struct resource *res_std = NULL, *res_rsa = NULL; - int probeflags = PROBE_ANY; - - probeflags &= ~PROBE_RSA; - - if (flags & UART_CONFIG_TYPE) - autoconfig(up, probeflags); - - /* - * If the port wasn't an RSA port, release the resource. - */ - if (up->port.type != PORT_RSA && res_rsa) - release_resource(res_rsa); - - if (up->port.type == PORT_UNKNOWN && res_std) - release_resource(res_std); -} - -static int -serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - if (ser->irq >= NR_IRQS || ser->irq < 0 || - ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || - ser->type > PORT_MAX_8250 || ser->type == PORT_CIRRUS || - ser->type == PORT_STARTECH) - return -EINVAL; - return 0; -} - -static const char * -serial8250_type(struct uart_port *port) -{ - int type = port->type; - - if (type >= ARRAY_SIZE(uart_config)) - type = 0; - return uart_config[type].name; -} - -static struct uart_ops serial8250_pops = { - .tx_empty = serial8250_tx_empty, - .set_mctrl = serial8250_set_mctrl, - .get_mctrl = serial8250_get_mctrl, - .stop_tx = serial8250_stop_tx, - .start_tx = serial8250_start_tx, - .stop_rx = serial8250_stop_rx, - .enable_ms = serial8250_enable_ms, - .break_ctl = serial8250_break_ctl, - .startup = serial8250_startup, - .shutdown = serial8250_shutdown, - .set_termios = serial8250_set_termios, - .pm = serial8250_pm, - .type = serial8250_type, - .release_port = serial8250_release_port, - .request_port = serial8250_request_port, - .config_port = serial8250_config_port, - .verify_port = serial8250_verify_port, -}; - -static struct uart_8250_port serial8250_ports[UART_NR]; - -static void __init serial8250_isa_init_ports(void) -{ - struct uart_8250_port *up; - static int first = 1; - int i; - - if (!first) - return; - first = 0; - - for (i = 0, up = serial8250_ports; i < ARRAY_SIZE(old_serial_port); - i++, up++) { - up->port.iobase = old_serial_port[i].port; - up->port.irq = old_serial_port[i].irq; - up->port.uartclk = get_au1x00_uart_baud_base(); - up->port.flags = old_serial_port[i].flags; - up->port.hub6 = old_serial_port[i].hub6; - up->port.membase = old_serial_port[i].iomem_base; - up->port.iotype = old_serial_port[i].io_type; - up->port.regshift = old_serial_port[i].iomem_reg_shift; - up->port.ops = &serial8250_pops; - } -} - -static void __init serial8250_register_ports(struct uart_driver *drv) -{ - int i; - - serial8250_isa_init_ports(); - - for (i = 0; i < UART_NR; i++) { - struct uart_8250_port *up = &serial8250_ports[i]; - - up->port.line = i; - up->port.ops = &serial8250_pops; - init_timer(&up->timer); - up->timer.function = serial8250_timeout; - - /* - * ALPHA_KLUDGE_MCR needs to be killed. - */ - up->mcr_mask = ~ALPHA_KLUDGE_MCR; - up->mcr_force = ALPHA_KLUDGE_MCR; - - uart_add_one_port(drv, &up->port); - } -} - -#ifdef CONFIG_SERIAL_AU1X00_CONSOLE - -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - -/* - * Wait for transmitter & holding register to empty - */ -static inline void wait_for_xmitr(struct uart_8250_port *up) -{ - unsigned int status, tmout = 10000; - - /* Wait up to 10ms for the character(s) to be sent. */ - do { - status = serial_in(up, UART_LSR); - - if (status & UART_LSR_BI) - up->lsr_break_flag = UART_LSR_BI; - - if (--tmout == 0) - break; - udelay(1); - } while ((status & BOTH_EMPTY) != BOTH_EMPTY); - - /* Wait up to 1s for flow control if necessary */ - if (up->port.flags & UPF_CONS_FLOW) { - tmout = 1000000; - while (--tmout && - ((serial_in(up, UART_MSR) & UART_MSR_CTS) == 0)) - udelay(1); - } -} - -static void au1x00_console_putchar(struct uart_port *port, int ch) -{ - struct uart_8250_port *up = (struct uart_8250_port *)port; - - wait_for_xmitr(up); - serial_out(up, UART_TX, ch); -} - -/* - * Print a string to the serial port trying not to disturb - * any possible real use of the port... - * - * The console_lock must be held when we get here. - */ -static void -serial8250_console_write(struct console *co, const char *s, unsigned int count) -{ - struct uart_8250_port *up = &serial8250_ports[co->index]; - unsigned int ier; - - /* - * First save the UER then disable the interrupts - */ - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); - - uart_console_write(&up->port, s, count, au1x00_console_putchar); - - /* - * Finally, wait for transmitter to become empty - * and restore the IER - */ - wait_for_xmitr(up); - serial_out(up, UART_IER, ier); -} - -static int __init serial8250_console_setup(struct console *co, char *options) -{ - struct uart_port *port; - int baud = 9600; - int bits = 8; - int parity = 'n'; - int flow = 'n'; - - /* - * Check whether an invalid uart number has been specified, and - * if so, search for the first available port that does have - * console support. - */ - if (co->index >= UART_NR) - co->index = 0; - port = &serial8250_ports[co->index].port; - - /* - * Temporary fix. - */ - spin_lock_init(&port->lock); - - if (options) - uart_parse_options(options, &baud, &parity, &bits, &flow); - - return uart_set_options(port, co, baud, parity, bits, flow); -} - -extern struct uart_driver serial8250_reg; -static struct console serial8250_console = { - .name = "ttyS", - .write = serial8250_console_write, - .device = uart_console_device, - .setup = serial8250_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &serial8250_reg, -}; - -static int __init serial8250_console_init(void) -{ - serial8250_isa_init_ports(); - register_console(&serial8250_console); - return 0; -} -console_initcall(serial8250_console_init); - -#define SERIAL8250_CONSOLE &serial8250_console -#else -#define SERIAL8250_CONSOLE NULL -#endif - -static struct uart_driver serial8250_reg = { - .owner = THIS_MODULE, - .driver_name = "serial", - .devfs_name = "tts/", - .dev_name = "ttyS", - .major = TTY_MAJOR, - .minor = 64, - .nr = UART_NR, - .cons = SERIAL8250_CONSOLE, -}; - -int __init early_serial_setup(struct uart_port *port) -{ - serial8250_isa_init_ports(); - serial8250_ports[port->line].port = *port; - serial8250_ports[port->line].port.ops = &serial8250_pops; - return 0; -} - -/** - * serial8250_suspend_port - suspend one serial port - * @line: serial line number - * @level: the level of port suspension, as per uart_suspend_port - * - * Suspend one serial port. - */ -void serial8250_suspend_port(int line) -{ - uart_suspend_port(&serial8250_reg, &serial8250_ports[line].port); -} - -/** - * serial8250_resume_port - resume one serial port - * @line: serial line number - * @level: the level of port resumption, as per uart_resume_port - * - * Resume one serial port. - */ -void serial8250_resume_port(int line) -{ - uart_resume_port(&serial8250_reg, &serial8250_ports[line].port); -} - -static int __init serial8250_init(void) -{ - int ret, i; - - printk(KERN_INFO "Serial: Au1x00 driver\n"); - - for (i = 0; i < NR_IRQS; i++) - spin_lock_init(&irq_lists[i].lock); - - ret = uart_register_driver(&serial8250_reg); - if (ret >= 0) - serial8250_register_ports(&serial8250_reg); - - return ret; -} - -static void __exit serial8250_exit(void) -{ - int i; - - for (i = 0; i < UART_NR; i++) - uart_remove_one_port(&serial8250_reg, &serial8250_ports[i].port); - - uart_unregister_driver(&serial8250_reg); -} - -module_init(serial8250_init); -module_exit(serial8250_exit); - -EXPORT_SYMBOL(serial8250_suspend_port); -EXPORT_SYMBOL(serial8250_resume_port); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Au1x00 serial driver\n"); diff --git a/include/asm-mips/serial.h b/include/asm-mips/serial.h index e796d75f027e..7b2366412203 100644 --- a/include/asm-mips/serial.h +++ b/include/asm-mips/serial.h @@ -103,88 +103,6 @@ #define IVR_SERIAL_PORT_DEFNS #endif -#ifdef CONFIG_SERIAL_AU1X00 -#include -#ifdef CONFIG_SOC_AU1000 -#define AU1000_SERIAL_PORT_DEFNS \ - { .baud_base = 0, .port = UART0_ADDR, \ - .iomem_base = (unsigned char *)UART0_ADDR, \ - .irq = AU1000_UART0_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART1_ADDR, \ - .iomem_base = (unsigned char *)UART1_ADDR, \ - .irq = AU1000_UART1_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART2_ADDR, \ - .iomem_base = (unsigned char *)UART2_ADDR, \ - .irq = AU1000_UART2_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART3_ADDR, \ - .iomem_base = (unsigned char *)UART3_ADDR, \ - .irq = AU1000_UART3_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, -#endif - -#ifdef CONFIG_SOC_AU1500 -#define AU1000_SERIAL_PORT_DEFNS \ - { .baud_base = 0, .port = UART0_ADDR, \ - .iomem_base = (unsigned char *)UART0_ADDR, \ - .irq = AU1500_UART0_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART3_ADDR, \ - .iomem_base = (unsigned char *)UART3_ADDR, \ - .irq = AU1500_UART3_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, -#endif - -#ifdef CONFIG_SOC_AU1100 -#define AU1000_SERIAL_PORT_DEFNS \ - { .baud_base = 0, .port = UART0_ADDR, \ - .iomem_base = (unsigned char *)UART0_ADDR, \ - .irq = AU1100_UART0_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART1_ADDR, \ - .iomem_base = (unsigned char *)UART1_ADDR, \ - .irq = AU1100_UART1_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART3_ADDR, \ - .iomem_base = (unsigned char *)UART3_ADDR, \ - .irq = AU1100_UART3_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, -#endif - -#ifdef CONFIG_SOC_AU1550 -#define AU1000_SERIAL_PORT_DEFNS \ - { .baud_base = 0, .port = UART0_ADDR, \ - .iomem_base = (unsigned char *)UART0_ADDR, \ - .irq = AU1550_UART0_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART1_ADDR, \ - .iomem_base = (unsigned char *)UART1_ADDR, \ - .irq = AU1550_UART1_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART3_ADDR, \ - .iomem_base = (unsigned char *)UART3_ADDR, \ - .irq = AU1550_UART3_INT, .flags = STD_COM_FLAGS,\ - .iomem_reg_shift = 2 }, -#endif - -#ifdef CONFIG_SOC_AU1200 -#define AU1000_SERIAL_PORT_DEFNS \ - { .baud_base = 0, .port = UART0_ADDR, \ - .iomem_base = (unsigned char *)UART0_ADDR, \ - .irq = AU1200_UART0_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, \ - { .baud_base = 0, .port = UART1_ADDR, \ - .iomem_base = (unsigned char *)UART1_ADDR, \ - .irq = AU1200_UART1_INT, .flags = STD_COM_FLAGS, \ - .iomem_reg_shift = 2 }, -#endif - -#else -#define AU1000_SERIAL_PORT_DEFNS -#endif - #ifdef CONFIG_HAVE_STD_PC_SERIAL_PORT #define STD_SERIAL_PORT_DEFNS \ /* UART CLK PORT IRQ FLAGS */ \ @@ -331,7 +249,6 @@ MOMENCO_OCELOT_G_SERIAL_PORT_DEFNS \ MOMENCO_OCELOT_C_SERIAL_PORT_DEFNS \ MOMENCO_OCELOT_SERIAL_PORT_DEFNS \ - MOMENCO_OCELOT_3_SERIAL_PORT_DEFNS \ - AU1000_SERIAL_PORT_DEFNS + MOMENCO_OCELOT_3_SERIAL_PORT_DEFNS #endif /* _ASM_SERIAL_H */ From c9b4470e2d9d611181fe488fdf463948d8b24901 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 26 Mar 2006 21:40:27 +0100 Subject: [PATCH 010/306] [ARM] 3414/1: ep93xx: reset ethernet controller before uncompressing Patch from Lennert Buytenhek The Redboot version that cirrus supplies for the cirrus ep93xx doesn't turn off DMA from the ethernet MAC before jumping to linux, which means that we might end up with bits of RX status and packet data scribbled over the uncompressed kernel image. Work around this by resetting the ethernet MAC before we uncompress. We don't usually work around bootloader bugs, but considering that the large majority of ep93xx boards out there have this problem, I figured this it was justified in this case. Signed-off-by: Lennert Buytenhek Signed-off-by: Russell King --- include/asm-arm/arch-ep93xx/uncompress.h | 40 +++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/include/asm-arm/arch-ep93xx/uncompress.h b/include/asm-arm/arch-ep93xx/uncompress.h index 4410d217077e..2171082d4fc5 100644 --- a/include/asm-arm/arch-ep93xx/uncompress.h +++ b/include/asm-arm/arch-ep93xx/uncompress.h @@ -16,11 +16,21 @@ static unsigned char __raw_readb(unsigned int ptr) return *((volatile unsigned char *)ptr); } +static unsigned int __raw_readl(unsigned int ptr) +{ + return *((volatile unsigned int *)ptr); +} + static void __raw_writeb(unsigned char value, unsigned int ptr) { *((volatile unsigned char *)ptr) = value; } +static void __raw_writel(unsigned int value, unsigned int ptr) +{ + *((volatile unsigned int *)ptr) = value; +} + #define PHYS_UART1_DATA 0x808c0000 #define PHYS_UART1_FLAG 0x808c0018 @@ -49,5 +59,33 @@ static void putstr(const char *s) } } -#define arch_decomp_setup() + +/* + * Some bootloaders don't turn off DMA from the ethernet MAC before + * jumping to linux, which means that we might end up with bits of RX + * status and packet data scribbled over the uncompressed kernel image. + * Work around this by resetting the ethernet MAC before we uncompress. + */ +#define PHYS_ETH_SELF_CTL 0x80010020 +#define ETH_SELF_CTL_RESET 0x00000001 + +static void ethernet_reset(void) +{ + unsigned int v; + + /* Reset the ethernet MAC. */ + v = __raw_readl(PHYS_ETH_SELF_CTL); + __raw_writel(v | ETH_SELF_CTL_RESET, PHYS_ETH_SELF_CTL); + + /* Wait for reset to finish. */ + while (__raw_readl(PHYS_ETH_SELF_CTL) & ETH_SELF_CTL_RESET) + ; +} + + +static void arch_decomp_setup(void) +{ + ethernet_reset(); +} + #define arch_decomp_wdog() From bd8f103efef14980decf5d54a2de85835e9a6d9a Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Sun, 26 Mar 2006 21:40:27 +0100 Subject: [PATCH 011/306] [ARM] 3415/1: Akita: Add missing EXPORT_SYMBOL Patch from Richard Purdie Add an EXPORT_SYMBOL for the Akita IO Expander Device. Signed-off-by: Richard Purdie Signed-off-by: Russell King --- arch/arm/mach-pxa/spitz.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 30ec317bbb97..0dbb079ecd25 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -467,6 +467,8 @@ struct platform_device akitaioexp_device = { .id = -1, }; +EXPORT_SYMBOL_GPL(akitaioexp_device); + static void __init akita_init(void) { spitz_ficp_platform_data.transceiver_mode = akita_irda_transceiver_mode; From fbb18a277a6f192404aa20ece49529acb1e1e76d Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 26 Mar 2006 23:13:39 +0100 Subject: [PATCH 012/306] [SERIAL] amba-pl010: allow platforms to specify modem control method The amba-pl010 hardware does not provide RTS and DTR control lines; it is expected that these will be implemented using GPIO. Allow platforms to supply a function to implement manipulation of modem control lines. Signed-off-by: Russell King --- arch/arm/mach-integrator/core.c | 46 +++++++++ drivers/serial/amba-pl010.c | 160 +++++++++++++------------------- include/linux/amba/serial.h | 6 ++ 3 files changed, 119 insertions(+), 93 deletions(-) diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c index 20071a2767cc..576a5e979c00 100644 --- a/arch/arm/mach-integrator/core.c +++ b/arch/arm/mach-integrator/core.c @@ -15,7 +15,9 @@ #include #include #include +#include #include +#include #include #include @@ -28,6 +30,8 @@ #include "common.h" +static struct amba_pl010_data integrator_uart_data; + static struct amba_device rtc_device = { .dev = { .bus_id = "mb:15", @@ -44,6 +48,7 @@ static struct amba_device rtc_device = { static struct amba_device uart0_device = { .dev = { .bus_id = "mb:16", + .platform_data = &integrator_uart_data, }, .res = { .start = INTEGRATOR_UART0_BASE, @@ -57,6 +62,7 @@ static struct amba_device uart0_device = { static struct amba_device uart1_device = { .dev = { .bus_id = "mb:17", + .platform_data = &integrator_uart_data, }, .res = { .start = INTEGRATOR_UART1_BASE, @@ -115,6 +121,46 @@ static int __init integrator_init(void) arch_initcall(integrator_init); +/* + * On the Integrator platform, the port RTS and DTR are provided by + * bits in the following SC_CTRLS register bits: + * RTS DTR + * UART0 7 6 + * UART1 5 4 + */ +#define SC_CTRLC (IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLC_OFFSET) +#define SC_CTRLS (IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLS_OFFSET) + +static void integrator_uart_set_mctrl(struct amba_device *dev, void __iomem *base, unsigned int mctrl) +{ + unsigned int ctrls = 0, ctrlc = 0, rts_mask, dtr_mask; + + if (dev == &uart0_device) { + rts_mask = 1 << 4; + dtr_mask = 1 << 5; + } else { + rts_mask = 1 << 6; + dtr_mask = 1 << 7; + } + + if (mctrl & TIOCM_RTS) + ctrlc |= rts_mask; + else + ctrls |= rts_mask; + + if (mctrl & TIOCM_DTR) + ctrlc |= dtr_mask; + else + ctrls |= dtr_mask; + + __raw_writel(ctrls, SC_CTRLS); + __raw_writel(ctrlc, SC_CTRLC); +} + +static struct amba_pl010_data integrator_uart_data = { + .set_mctrl = integrator_uart_set_mctrl, +}; + #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET static DEFINE_SPINLOCK(cm_lock); diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c index 127d6cd5de7f..1631414000a2 100644 --- a/drivers/serial/amba-pl010.c +++ b/drivers/serial/amba-pl010.c @@ -51,8 +51,6 @@ #include #include -#include -#include #define UART_NR 2 @@ -65,26 +63,16 @@ #define UART_RX_DATA(s) (((s) & UART01x_FR_RXFE) == 0) #define UART_TX_READY(s) (((s) & UART01x_FR_TXFF) == 0) -#define UART_DUMMY_RSR_RX /*256*/0 +#define UART_DUMMY_RSR_RX 256 #define UART_PORT_SIZE 64 -/* - * On the Integrator platform, the port RTS and DTR are provided by - * bits in the following SC_CTRLS register bits: - * RTS DTR - * UART0 7 6 - * UART1 5 4 - */ -#define SC_CTRLC (IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLC_OFFSET) -#define SC_CTRLS (IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLS_OFFSET) - /* * We wrap our port structure around the generic uart_port. */ struct uart_amba_port { struct uart_port port; - unsigned int dtr_mask; - unsigned int rts_mask; + struct amba_device *dev; + struct amba_pl010_data *data; unsigned int old_status; }; @@ -300,20 +288,9 @@ static unsigned int pl010_get_mctrl(struct uart_port *port) static void pl010_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_amba_port *uap = (struct uart_amba_port *)port; - unsigned int ctrls = 0, ctrlc = 0; - if (mctrl & TIOCM_RTS) - ctrlc |= uap->rts_mask; - else - ctrls |= uap->rts_mask; - - if (mctrl & TIOCM_DTR) - ctrlc |= uap->dtr_mask; - else - ctrls |= uap->dtr_mask; - - __raw_writel(ctrls, SC_CTRLS); - __raw_writel(ctrlc, SC_CTRLC); + if (uap->data) + uap->data->set_mctrl(uap->dev, uap->port.membase, mctrl); } static void pl010_break_ctl(struct uart_port *port, int break_state) @@ -539,38 +516,7 @@ static struct uart_ops amba_pl010_pops = { .verify_port = pl010_verify_port, }; -static struct uart_amba_port amba_ports[UART_NR] = { - { - .port = { - .membase = (void *)IO_ADDRESS(INTEGRATOR_UART0_BASE), - .mapbase = INTEGRATOR_UART0_BASE, - .iotype = UPIO_MEM, - .irq = IRQ_UARTINT0, - .uartclk = 14745600, - .fifosize = 16, - .ops = &amba_pl010_pops, - .flags = UPF_BOOT_AUTOCONF, - .line = 0, - }, - .dtr_mask = 1 << 5, - .rts_mask = 1 << 4, - }, - { - .port = { - .membase = (void *)IO_ADDRESS(INTEGRATOR_UART1_BASE), - .mapbase = INTEGRATOR_UART1_BASE, - .iotype = UPIO_MEM, - .irq = IRQ_UARTINT1, - .uartclk = 14745600, - .fifosize = 16, - .ops = &amba_pl010_pops, - .flags = UPF_BOOT_AUTOCONF, - .line = 1, - }, - .dtr_mask = 1 << 7, - .rts_mask = 1 << 6, - } -}; +static struct uart_amba_port *amba_ports[UART_NR]; #ifdef CONFIG_SERIAL_AMBA_PL010_CONSOLE @@ -588,7 +534,7 @@ static void pl010_console_putchar(struct uart_port *port, int ch) static void pl010_console_write(struct console *co, const char *s, unsigned int count) { - struct uart_port *port = &amba_ports[co->index].port; + struct uart_port *port = &amba_ports[co->index]->port; unsigned int status, old_cr; /* @@ -651,7 +597,7 @@ static int __init pl010_console_setup(struct console *co, char *options) */ if (co->index >= UART_NR) co->index = 0; - port = &amba_ports[co->index].port; + port = &amba_ports[co->index]->port; if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); @@ -672,24 +618,6 @@ static struct console amba_console = { .data = &amba_reg, }; -static int __init amba_console_init(void) -{ - /* - * All port initializations are done statically - */ - register_console(&amba_console); - return 0; -} -console_initcall(amba_console_init); - -static int __init amba_late_console_init(void) -{ - if (!(amba_console.flags & CON_ENABLED)) - register_console(&amba_console); - return 0; -} -late_initcall(amba_late_console_init); - #define AMBA_CONSOLE &amba_console #else #define AMBA_CONSOLE NULL @@ -707,30 +635,76 @@ static struct uart_driver amba_reg = { static int pl010_probe(struct amba_device *dev, void *id) { - int i; + struct uart_amba_port *port; + void __iomem *base; + int i, ret; - for (i = 0; i < UART_NR; i++) { - if (amba_ports[i].port.mapbase != dev->res.start) - continue; + for (i = 0; i < ARRAY_SIZE(amba_ports); i++) + if (amba_ports[i] == NULL) + break; - amba_ports[i].port.dev = &dev->dev; - uart_add_one_port(&amba_reg, &amba_ports[i].port); - amba_set_drvdata(dev, &amba_ports[i]); - break; + if (i == ARRAY_SIZE(amba_ports)) { + ret = -EBUSY; + goto out; } - return 0; + port = kzalloc(sizeof(struct uart_amba_port), GFP_KERNEL); + if (!port) { + ret = -ENOMEM; + goto out; + } + + base = ioremap(dev->res.start, PAGE_SIZE); + if (!base) { + ret = -ENOMEM; + goto free; + } + + port->port.dev = &dev->dev; + port->port.mapbase = dev->res.start; + port->port.membase = base; + port->port.iotype = UPIO_MEM; + port->port.irq = dev->irq[0]; + port->port.uartclk = 14745600; + port->port.fifosize = 16; + port->port.ops = &amba_pl010_pops; + port->port.flags = UPF_BOOT_AUTOCONF; + port->port.line = i; + port->dev = dev; + port->data = dev->dev.platform_data; + + amba_ports[i] = port; + + amba_set_drvdata(dev, port); + ret = uart_add_one_port(&amba_reg, &port->port); + if (ret) { + amba_set_drvdata(dev, NULL); + amba_ports[i] = NULL; + iounmap(base); + free: + kfree(port); + } + + out: + return ret; } static int pl010_remove(struct amba_device *dev) { - struct uart_amba_port *uap = amba_get_drvdata(dev); - - if (uap) - uart_remove_one_port(&amba_reg, &uap->port); + struct uart_amba_port *port = amba_get_drvdata(dev); + int i; amba_set_drvdata(dev, NULL); + uart_remove_one_port(&amba_reg, &port->port); + + for (i = 0; i < ARRAY_SIZE(amba_ports); i++) + if (amba_ports[i] == port) + amba_ports[i] = NULL; + + iounmap(port->port.membase); + kfree(port); + return 0; } diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index dc726ffccebd..48ee32a18ac5 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -158,4 +158,10 @@ #define UART01x_RSR_ANY (UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE) #define UART01x_FR_MODEM_ANY (UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS) +#ifndef __ASSEMBLY__ +struct amba_pl010_data { + void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl); +}; +#endif + #endif From aee85fe8e8143d3f54d9e6d3c6cdd40ead563267 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 26 Mar 2006 23:16:39 +0100 Subject: [PATCH 013/306] [SERIAL] Provide Cirrus EP93xx AMBA PL010 serial support. Signed-off-by: Lennert Buytenhek Signed-off-by: Russell King --- arch/arm/mach-ep93xx/core.c | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index 865427bfad7e..2d892e4daa07 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -30,7 +30,9 @@ #include #include #include +#include #include +#include #include #include @@ -360,6 +362,68 @@ void __init ep93xx_init_irq(void) /************************************************************************* * EP93xx peripheral handling *************************************************************************/ +#define EP93XX_UART_MCR_OFFSET (0x0100) + +static void ep93xx_uart_set_mctrl(struct amba_device *dev, + void __iomem *base, unsigned int mctrl) +{ + unsigned int mcr; + + mcr = 0; + if (!(mctrl & TIOCM_RTS)) + mcr |= 2; + if (!(mctrl & TIOCM_DTR)) + mcr |= 1; + + __raw_writel(mcr, base + EP93XX_UART_MCR_OFFSET); +} + +static struct amba_pl010_data ep93xx_uart_data = { + .set_mctrl = ep93xx_uart_set_mctrl, +}; + +static struct amba_device uart1_device = { + .dev = { + .bus_id = "apb:uart1", + .platform_data = &ep93xx_uart_data, + }, + .res = { + .start = EP93XX_UART1_PHYS_BASE, + .end = EP93XX_UART1_PHYS_BASE + 0x0fff, + .flags = IORESOURCE_MEM, + }, + .irq = { IRQ_EP93XX_UART1, NO_IRQ }, + .periphid = 0x00041010, +}; + +static struct amba_device uart2_device = { + .dev = { + .bus_id = "apb:uart2", + .platform_data = &ep93xx_uart_data, + }, + .res = { + .start = EP93XX_UART2_PHYS_BASE, + .end = EP93XX_UART2_PHYS_BASE + 0x0fff, + .flags = IORESOURCE_MEM, + }, + .irq = { IRQ_EP93XX_UART2, NO_IRQ }, + .periphid = 0x00041010, +}; + +static struct amba_device uart3_device = { + .dev = { + .bus_id = "apb:uart3", + .platform_data = &ep93xx_uart_data, + }, + .res = { + .start = EP93XX_UART3_PHYS_BASE, + .end = EP93XX_UART3_PHYS_BASE + 0x0fff, + .flags = IORESOURCE_MEM, + }, + .irq = { IRQ_EP93XX_UART3, NO_IRQ }, + .periphid = 0x00041010, +}; + void __init ep93xx_init_devices(void) { unsigned int v; @@ -371,4 +435,8 @@ void __init ep93xx_init_devices(void) v &= ~EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE; __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); __raw_writel(v, EP93XX_SYSCON_DEVICE_CONFIG); + + amba_device_register(&uart1_device, &iomem_resource); + amba_device_register(&uart2_device, &iomem_resource); + amba_device_register(&uart3_device, &iomem_resource); } From 6abaaaae6d5ed52422c8caf65f3cdbb95579bb58 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 26 Mar 2006 17:37:54 -0800 Subject: [PATCH 014/306] [IPSEC]: Fix tunnel error handling in ipcomp6 The error handling in ipcomp6_tunnel_create is broken in two ways: 1) If we fail to allocate an SPI (this should never happen in practice since there are plenty of 32-bit SPI values for us to use), we will still go ahead and create the SA. 2) When xfrm_init_state fails, we first of all may trigger the BUG_TRAP in __xfrm_state_destroy because we didn't set the state to DEAD. More importantly we end up returning the freed state as if we succeeded! This patch fixes them both. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/ipcomp6.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 028b636687ec..d4cfec3f414e 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -228,6 +228,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr); + if (!t->id.spi) + goto error; + memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET6; @@ -243,7 +246,9 @@ out: return t; error: + t->km.state = XFRM_STATE_DEAD; xfrm_state_put(t); + t = NULL; goto out; } From 3eb4801d7bde42b82f05137392a1ee0ece090bad Mon Sep 17 00:00:00 2001 From: Norbert Kiesel Date: Sun, 26 Mar 2006 17:39:55 -0800 Subject: [PATCH 015/306] [NET]: drop duplicate assignment in request_sock Just noticed that request_sock.[ch] contain a useless assignment of rskq_accept_head to itself. I assume this is a typo and the 2nd one was supposed to be _tail. However, setting _tail to NULL is not needed, so the patch below just drops the 2nd assignment. Signed-off-By: Norbert Kiesel Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 +- net/core/request_sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 11641c9384f7..c5d7f920c352 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -145,7 +145,7 @@ static inline struct request_sock * { struct request_sock *req = queue->rskq_accept_head; - queue->rskq_accept_head = queue->rskq_accept_head = NULL; + queue->rskq_accept_head = NULL; return req; } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 98f0fc923f91..1e44eda1fda9 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -51,7 +51,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); - queue->rskq_accept_head = queue->rskq_accept_head = NULL; + queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; write_lock_bh(&queue->syn_wait_lock); From 28832e83379afd0b0e83b78ac317290c79ebd496 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 8 Mar 2006 11:19:51 +0100 Subject: [PATCH 016/306] [PATCH] update max_sectors documentation The max_sectors has been split into max_hw_sectors and max_sectors for some time. A patch to have blk_queue_max_sectors enforce this was sent by me and it broke IDE. This patch updates the documentation. Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 8e63831971d5..f989a9e839b4 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -132,8 +132,18 @@ Some new queue property settings: limit. No highmem default. blk_queue_max_sectors(q, max_sectors) - Maximum size request you can handle in units of 512 byte - sectors. 255 default. + Sets two variables that limit the size of the request. + + - The request queue's max_sectors, which is a soft size in + in units of 512 byte sectors, and could be dynamically varied + by the core kernel. + + - The request queue's max_hw_sectors, which is a hard limit + and reflects the maximum size request a driver can handle + in units of 512 byte sectors. + + The default for both max_sectors and max_hw_sectors is + 255. The upper limit of max_sectors is 1024. blk_queue_max_phys_segments(q, max_segments) Maximum physical segments you can handle in a request. 128 From 06ff37ffb4ba8bcbda0e9d19c712c954ef7b8a0a Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Wed, 8 Mar 2006 11:21:52 +0100 Subject: [PATCH 017/306] [PATCH] kzalloc() conversion in drivers/block this patch converts drivers/block to kzalloc usage. Compile tested with allyesconfig. Signed-off-by: Eric Sesterhenn Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 7 ++----- drivers/block/cciss.c | 10 +++------- drivers/block/cciss_scsi.c | 3 +-- drivers/block/paride/bpck6.c | 3 +-- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 9bdea2a5cf0e..49c7cd558ddf 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -311,11 +311,10 @@ static boolean DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller) CommandsRemaining = CommandAllocationGroupSize; CommandGroupByteCount = CommandsRemaining * CommandAllocationLength; - AllocationPointer = kmalloc(CommandGroupByteCount, GFP_ATOMIC); + AllocationPointer = kzalloc(CommandGroupByteCount, GFP_ATOMIC); if (AllocationPointer == NULL) return DAC960_Failure(Controller, "AUXILIARY STRUCTURE CREATION"); - memset(AllocationPointer, 0, CommandGroupByteCount); } Command = (DAC960_Command_T *) AllocationPointer; AllocationPointer += CommandAllocationLength; @@ -2709,14 +2708,12 @@ DAC960_DetectController(struct pci_dev *PCI_Device, void __iomem *BaseAddress; int i; - Controller = (DAC960_Controller_T *) - kmalloc(sizeof(DAC960_Controller_T), GFP_ATOMIC); + Controller = kzalloc(sizeof(DAC960_Controller_T), GFP_ATOMIC); if (Controller == NULL) { DAC960_Error("Unable to allocate Controller structure for " "Controller at\n", NULL); return NULL; } - memset(Controller, 0, sizeof(DAC960_Controller_T)); Controller->ControllerNumber = DAC960_ControllerCount; DAC960_Controllers[DAC960_ControllerCount++] = Controller; Controller->Bus = PCI_Device->bus->number; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 71ec9e664383..168da0c9ed3d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -996,13 +996,11 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, status = -EINVAL; goto cleanup1; } - buff = (unsigned char **) kmalloc(MAXSGENTRIES * - sizeof(char *), GFP_KERNEL); + buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); if (!buff) { status = -ENOMEM; goto cleanup1; } - memset(buff, 0, MAXSGENTRIES); buff_size = (int *) kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL); if (!buff_size) { @@ -2940,13 +2938,12 @@ static void cciss_getgeometry(int cntl_num) int block_size; int total_size; - ld_buff = kmalloc(sizeof(ReportLunData_struct), GFP_KERNEL); + ld_buff = kzalloc(sizeof(ReportLunData_struct), GFP_KERNEL); if (ld_buff == NULL) { printk(KERN_ERR "cciss: out of memory\n"); return; } - memset(ld_buff, 0, sizeof(ReportLunData_struct)); size_buff = kmalloc(sizeof( ReadCapdata_struct), GFP_KERNEL); if (size_buff == NULL) { @@ -3060,10 +3057,9 @@ static int alloc_cciss_hba(void) for(i=0; i< MAX_CTLR; i++) { if (!hba[i]) { ctlr_info_t *p; - p = kmalloc(sizeof(ctlr_info_t), GFP_KERNEL); + p = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL); if (!p) goto Enomem; - memset(p, 0, sizeof(ctlr_info_t)); for (n = 0; n < NWD; n++) p->gendisk[n] = disk[n]; hba[i] = p; diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 0e66e904bd8c..597c007fe81b 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -1027,12 +1027,11 @@ cciss_update_non_disk_devices(int cntl_num, int hostno) int i; c = (ctlr_info_t *) hba[cntl_num]; - ld_buff = kmalloc(reportlunsize, GFP_KERNEL); + ld_buff = kzalloc(reportlunsize, GFP_KERNEL); if (ld_buff == NULL) { printk(KERN_ERR "cciss: out of memory\n"); return; } - memset(ld_buff, 0, reportlunsize); inq_buff = kmalloc(OBDR_TAPE_INQ_SIZE, GFP_KERNEL); if (inq_buff == NULL) { printk(KERN_ERR "cciss: out of memory\n"); diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c index 08d858ad64db..41a237c5957d 100644 --- a/drivers/block/paride/bpck6.c +++ b/drivers/block/paride/bpck6.c @@ -224,10 +224,9 @@ static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose ) static int bpck6_init_proto(PIA *pi) { - Interface *p = kmalloc(sizeof(Interface), GFP_KERNEL); + Interface *p = kzalloc(sizeof(Interface), GFP_KERNEL); if (p) { - memset(p, 0, sizeof(Interface)); pi->private = (unsigned long)p; return 0; } From f68110fc28859f5d7231d5c4fb6dbe68b1394c9b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Mar 2006 13:31:44 +0100 Subject: [PATCH 018/306] [BLOCK] ll_rw_blk: kmalloc -> kzalloc conversion Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 062067fa7ead..b836b43113da 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -906,17 +906,15 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) __FUNCTION__, depth); } - tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); + tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); if (!tag_index) goto fail; nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); + tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); if (!tag_map) goto fail; - memset(tag_index, 0, depth * sizeof(struct request *)); - memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); tags->real_max_depth = depth; tags->max_depth = depth; tags->tag_index = tag_index; From b8fca1c7682105c843319728d8e37b42b19092bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Mar 2006 15:24:37 +0100 Subject: [PATCH 019/306] [PATCH] ide-cd: quiet down GPCMD_READ_CDVD_CAPACITY failure Some drives like to throw a: ATAPI device hdc: Error: Not ready -- (Sense key=0x02) Incompatible medium installed -- (asc=0x30, ascq=0x00) The failed "Read Cd/Dvd Capacity" packet command was: "25 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 " warning on incompatible media, so quiet down this error. Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index c7671e188017..b4a41d6d0714 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -2143,6 +2143,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, req.cmd[0] = GPCMD_READ_CDVD_CAPACITY; req.data = (char *)&capbuf; req.data_len = sizeof(capbuf); + req.flags |= REQ_QUIET; stat = cdrom_queue_packet_command(drive, &req); if (stat == 0) { From 4c5d0bbde9669cfb7f7fd4670dc9a117aea90384 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 08:08:01 +0100 Subject: [PATCH 020/306] [PATCH] blk_execute_rq_nowait-speedup Both elv_add_request() and generic_unplug_device() grab the queue lock and disable interrupts, do that locally and use the __ variants. Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index b836b43113da..7fc903b5f3cd 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -2477,10 +2477,12 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->flags |= REQ_NOMERGE; rq->end_io = done; - elv_add_request(q, rq, where, 1); - generic_unplug_device(q); + WARN_ON(irqs_disabled()); + spin_lock_irq(q->queue_lock); + __elv_add_request(q, rq, where, 1); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); } - EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); /** From 837c7878771c15ed8d85ecf814ece7fcb4551b46 Mon Sep 17 00:00:00 2001 From: Ben Woodard Date: Wed, 22 Mar 2006 08:09:31 +0100 Subject: [PATCH 021/306] [BLOCK] increase size of disk stat counters The kernel's representation of the disk statistics uses the type unsigned which is 32b on both 32b and 64b platforms. Unfortunately, most system tools that work with these numbers that are exported in /proc/diskstats including iostat read these numbers into unsigned longs. This works fine on 32b platforms and when the number of IO transactions are small on 64b platforms. However, when the numbers wrap on 64b platforms & you read the numbers into unsigned longs, and compare the numbers to previous readings, then you get an unsigned representation of a negative number. This looks like a very large 64b number & gives you bizarre readouts in iostat: ilc4: Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util ilc4: sda 5.50 0.00 143.96 0.00 307496983987862656.00 0.00 153748491993931328.00 0.00 2136028725038430.00 7.94 55.12 5.59 80.42 Though fixing iostat in user space is possible, and a quick survey indicates that several other similar tools also use unsigned longs when processing /proc/diskstats. Therefore, it seems like a better approach would be to extend the length of the disk_stats structure on 64b architectures to 64b. The following patch does that. It should not affect the operation on 32b platforms. Signed-off-by: Ben Woodard Cc: Rick Lindsley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- include/linux/genhd.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 64510fd88621..db4c60c802d6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -454,8 +454,8 @@ static ssize_t disk_stats_read(struct gendisk * disk, char *page) disk_round_stats(disk); preempt_enable(); return sprintf(page, - "%8u %8u %8llu %8u " - "%8u %8u %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " "%8u %8u %8u" "\n", disk_stat_read(disk, ios[READ]), @@ -649,7 +649,7 @@ static int diskstats_show(struct seq_file *s, void *v) preempt_disable(); disk_round_stats(gp); preempt_enable(); - seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n", + seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", gp->major, n + gp->first_minor, disk_name(gp, n, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fd647fde5ec1..179fea53fc81 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -89,12 +89,12 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 struct disk_stats { - unsigned sectors[2]; /* READs and WRITEs */ - unsigned ios[2]; - unsigned merges[2]; - unsigned ticks[2]; - unsigned io_ticks; - unsigned time_in_queue; + unsigned long sectors[2]; /* READs and WRITEs */ + unsigned long ios[2]; + unsigned long merges[2]; + unsigned long ticks[2]; + unsigned long io_ticks; + unsigned long time_in_queue; }; struct gendisk { From 89a7689e5c039090d99cbdc3625252c3dee50f7f Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 24 Mar 2006 10:00:57 +0100 Subject: [PATCH 022/306] [PATCH] unused label in drivers/block/cciss. this patch removes a warning about an unused label, by moving the label into the ifdef. Signed-off-by: Eric Sesterhenn Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 168da0c9ed3d..1b0fd31c57c3 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2727,9 +2727,9 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *c, struct pci_dev *pdev, return; } } +default_int_mode: #endif /* CONFIG_PCI_MSI */ /* if we get here we're going to use the default interrupt mode */ -default_int_mode: c->intr[SIMPLE_MODE_INT] = pdev->irq; return; } From 09540e691d7f57684b296b60241b2ff357fae3ab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 26 Mar 2006 14:32:09 +0200 Subject: [PATCH 023/306] [PATCH] Fix blktrace compile with sysfs not defined debugfs depends on sysfs, so make blktrace kconfig option depend on that. Reported by Adrian Bunk. Signed-off-by: Jens Axboe --- block/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/block/Kconfig b/block/Kconfig index 43ca070dc0f8..5536839886ff 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -13,6 +13,7 @@ config LBD config BLK_DEV_IO_TRACE bool "Support for tracing block io actions" + depends on SYSFS select RELAY select DEBUG_FS help From ae36b883d29e53b6083ed3d1d44f254cee7507d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 26 Mar 2006 14:38:54 +0200 Subject: [PATCH 024/306] [PATCH] Don't make debugfs depend on DEBUG_KERNEL We use it generally now, at least blktrace isn't a specific debug kernel feature. Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7e70ab13e191..6e8a60f67c7a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -172,7 +172,7 @@ config DEBUG_IOREMAP config DEBUG_FS bool "Debug Filesystem" - depends on DEBUG_KERNEL && SYSFS + depends on SYSFS help debugfs is a virtual file system that kernel developers use to put debugging files into. Enable this option to be able to read and From 7bebd83dbf096d0bf4b4bfbaf5d8844a05f5eafc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Mar 2006 01:07:55 -0800 Subject: [PATCH 025/306] [SPARC64]: Fix off-by-1 error in TSB grow check. The worst part about this bug is what it would cause a hugepage TSB to be allocated for every address space since "0 >= 0". Signed-off-by: David S. Miller --- arch/sparc64/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index d21ff3230c02..0db2f7d9fab5 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -413,12 +413,12 @@ good_area: #ifdef CONFIG_HUGETLB_PAGE mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); #endif - if (unlikely(mm_rss >= + if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) tsb_grow(mm, MM_TSB_BASE, mm_rss); #ifdef CONFIG_HUGETLB_PAGE mm_rss = mm->context.huge_pte_count; - if (unlikely(mm_rss >= + if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) tsb_grow(mm, MM_TSB_HUGE, mm_rss); #endif From c8c4b939e8dff82ee2bc22ed506ae29ecffca4c2 Mon Sep 17 00:00:00 2001 From: Erik Mouw Date: Mon, 27 Mar 2006 15:32:30 +0100 Subject: [PATCH 026/306] [ARM] 3416/1: Update LART site URL Patch from Erik Mouw The LART website moved to http://www.lartmaker.nl/. This patch updates the URL in ARM specific files. Signed-off-by: Erik Mouw Acked-by: Jan-Derk Bakker Signed-off-by: Russell King --- Documentation/arm/SA1100/Assabet | 2 +- Documentation/arm/SA1100/LART | 2 +- arch/arm/mach-sa1100/Kconfig | 2 +- arch/arm/mach-sa1100/cpu-sa1100.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet index cbbe5587c78d..78bc1c1b04e5 100644 --- a/Documentation/arm/SA1100/Assabet +++ b/Documentation/arm/SA1100/Assabet @@ -26,7 +26,7 @@ Installing a bootloader A couple of bootloaders able to boot Linux on Assabet are available: -BLOB (http://www.lart.tudelft.nl/lartware/blob/) +BLOB (http://www.lartmaker.nl/lartware/blob/) BLOB is a bootloader used within the LART project. Some contributed patches were merged into BLOB to add support for Assabet. diff --git a/Documentation/arm/SA1100/LART b/Documentation/arm/SA1100/LART index 2f73f513e16a..6d412b685598 100644 --- a/Documentation/arm/SA1100/LART +++ b/Documentation/arm/SA1100/LART @@ -11,4 +11,4 @@ is under development, with plenty of others in different stages of planning. The hardware designs for this board have been released under an open license; -see the LART page at http://www.lart.tudelft.nl/ for more information. +see the LART page at http://www.lartmaker.nl/ for more information. diff --git a/arch/arm/mach-sa1100/Kconfig b/arch/arm/mach-sa1100/Kconfig index 6923316b3d0d..cd67ab1b217b 100644 --- a/arch/arm/mach-sa1100/Kconfig +++ b/arch/arm/mach-sa1100/Kconfig @@ -111,7 +111,7 @@ config SA1100_LART bool "LART" help Say Y here if you are using the Linux Advanced Radio Terminal - (also known as the LART). See for + (also known as the LART). See for information on the LART. config SA1100_PLEB diff --git a/arch/arm/mach-sa1100/cpu-sa1100.c b/arch/arm/mach-sa1100/cpu-sa1100.c index 6435b2e48ffa..d68630b74d78 100644 --- a/arch/arm/mach-sa1100/cpu-sa1100.c +++ b/arch/arm/mach-sa1100/cpu-sa1100.c @@ -11,7 +11,7 @@ * linux-2.4.5-rmk1 * * This software has been developed while working on the LART - * computing board (http://www.lart.tudelft.nl/), which is + * computing board (http://www.lartmaker.nl/), which is * sponsored by the Mobile Multi-media Communications * (http://www.mmc.tudelft.nl/) and Ubiquitous Communications * (http://www.ubicom.tudelft.nl/) projects. From 4682adcfb06448827fbdfd8b6c636796de569b7d Mon Sep 17 00:00:00 2001 From: "Hyok S. Choi" Date: Mon, 27 Mar 2006 15:46:06 +0100 Subject: [PATCH 027/306] [ARM] nommu: trivial patch for arch/arm/lib/Makefile ifeq ($CONFIG_PREEMPT,y) -> ifeq ($(CONFIG_PREEMPT),y) Signed-off-by: Hyok S. Choi Signed-off-by: Russell King --- arch/arm/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 391f3ab3ff32..7b726b627ea5 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -18,7 +18,7 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ # the code in uaccess.S is not preemption safe and # probably faster on ARMv3 only -ifeq ($CONFIG_PREEMPT,y) +ifeq ($(CONFIG_PREEMPT),y) lib-y += copy_from_user.o copy_to_user.o else ifneq ($(CONFIG_CPU_32v3),y) From 3747b36eeab93d8969e86987bbc1d44971229b26 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Mar 2006 16:59:07 +0100 Subject: [PATCH 028/306] [ARM] proc-v6: mark page table walks outer-cacheable, shared. Enable NX. Mark page table walks with outer-cacheable attribute, and enable no-execute in page tables. Signed-off-by: Russell King --- arch/arm/mm/proc-v6.S | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S index 9a7e7c096aa9..ee6f15298735 100644 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S @@ -21,6 +21,14 @@ #define D_CACHE_LINE_SIZE 32 +#define TTB_C (1 << 0) +#define TTB_S (1 << 1) +#define TTB_IMP (1 << 2) +#define TTB_RGN_NC (0 << 3) +#define TTB_RGN_WBWA (1 << 3) +#define TTB_RGN_WT (2 << 3) +#define TTB_RGN_WB (3 << 3) + .macro cpsie, flags .ifc \flags, f .long 0xf1080040 @@ -115,7 +123,7 @@ ENTRY(cpu_v6_switch_mm) mov r2, #0 ldr r1, [r1, #MM_CONTEXT_ID] @ get mm->context.id #ifdef CONFIG_SMP - orr r0, r0, #2 @ set shared pgtable + orr r0, r0, #TTB_RGN_WBWA|TTB_S @ mark PTWs shared, outer cacheable #endif mcr p15, 0, r2, c7, c5, 6 @ flush BTAC/BTB mcr p15, 0, r2, c7, c10, 4 @ drain write buffer @@ -161,8 +169,8 @@ ENTRY(cpu_v6_set_pte) tst r1, #L_PTE_YOUNG biceq r2, r2, #PTE_EXT_APX | PTE_EXT_AP_MASK -@ tst r1, #L_PTE_EXEC -@ orreq r2, r2, #PTE_EXT_XN + tst r1, #L_PTE_EXEC + orreq r2, r2, #PTE_EXT_XN tst r1, #L_PTE_PRESENT moveq r2, #0 @@ -221,7 +229,7 @@ __v6_setup: mcr p15, 0, r0, c8, c7, 0 @ invalidate I + D TLBs mcr p15, 0, r0, c2, c0, 2 @ TTB control register #ifdef CONFIG_SMP - orr r4, r4, #2 @ set shared pgtable + orr r4, r4, #TTB_RGN_WBWA|TTB_S @ mark PTWs shared, outer cacheable #endif mcr p15, 0, r4, c2, c0, 1 @ load TTB1 #ifdef CONFIG_VFP From dbffa471611d3fc4b401ebabf7bb63ac0e0272b1 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Mon, 27 Mar 2006 01:14:26 -0800 Subject: [PATCH 029/306] [PATCH] PM-Timer: don't use workaround if chipset is not buggy Current timer_pm.c reads I/O port triple times, in order to avoid the bug of chipset. But I/O port is slow. 2.6.16 (pmtmr) Simple gettimeofday: 3.6532 microseconds 2.6.16+patch (pmtmr) Simple gettimeofday: 1.4582 microseconds [if chip is buggy, probably it will be 7us or more in 4.2% of probability.] This patch adds blacklist of buggy chip, and if chip is not buggy, this uses fast normal version instead of slow workaround version. If chip is buggy, warnings "pmtmr is slow". But sounds like there is gray zone. I found the PIIX4 errata, but I couldn't find the ICH4 errata. But some motherboard seems to have problem. So, if we found a ICH4, generate warnings, and use a workaround version. If user's ICH4 is good, the user can specify the "pmtmr_good" boot parameter to use fast version. Acked-by: John Stultz Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/timers/timer_pm.c | 102 +++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c index 264edaaac315..144e94a04933 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -45,24 +46,31 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ +static int pmtmr_need_workaround __read_mostly = 1; + /*helper function to safely read acpi pm timesource*/ static inline u32 read_pmtmr(void) { - u32 v1=0,v2=0,v3=0; - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); + if (pmtmr_need_workaround) { + u32 v1, v2, v3; - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; + /* It has been reported that because of various broken + * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time + * source is not latched, so you must read it multiple + * times to insure a safe value is read. + */ + do { + v1 = inl(pmtmr_ioport); + v2 = inl(pmtmr_ioport); + v3 = inl(pmtmr_ioport); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + /* mask the output to 24 bits */ + return v2 & ACPI_PM_MASK; + } + + return inl(pmtmr_ioport) & ACPI_PM_MASK; } @@ -263,6 +271,72 @@ struct init_timer_opts __initdata timer_pmtmr_init = { .opts = &timer_pmtmr, }; +#ifdef CONFIG_PCI +/* + * PIIX4 Errata: + * + * The power management timer may return improper results when read. + * Although the timer value settles properly after incrementing, + * while incrementing there is a 3 ns window every 69.8 ns where the + * timer value is indeterminate (a 4.2% chance that the data will be + * incorrect when read). As a result, the ACPI free running count up + * timer specification is violated due to erroneous reads. + */ +static int __init pmtmr_bug_check(void) +{ + static struct pci_device_id gray_list[] __initdata = { + /* these chipsets may have bug. */ + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801DB_0) }, + { }, + }; + struct pci_dev *dev; + int pmtmr_has_bug = 0; + u8 rev; + + if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) + return 0; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82371AB_3, NULL); + if (dev) { + pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + /* the bug has been fixed in PIIX4M */ + if (rev < 3) { + printk(KERN_WARNING "* Found PM-Timer Bug on this " + "chipset. Due to workarounds for a bug,\n" + "* this time source is slow. Consider trying " + "other time sources (clock=)\n"); + pmtmr_has_bug = 1; + } + pci_dev_put(dev); + } + + if (pci_dev_present(gray_list)) { + printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due" + " to workarounds for a bug,\n" + "* this time source is slow. If you are sure your timer" + " does not have\n" + "* this bug, please use \"pmtmr_good\" to disable the " + "workaround\n"); + pmtmr_has_bug = 1; + } + + if (!pmtmr_has_bug) + pmtmr_need_workaround = 0; + + return 0; +} +device_initcall(pmtmr_bug_check); +#endif + +static int __init pmtr_good_setup(char *__str) +{ + pmtmr_need_workaround = 0; + return 1; +} +__setup("pmtmr_good", pmtr_good_setup); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); From bb83da053319e81906473bec08e8f677d6ac615f Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:27 -0800 Subject: [PATCH 030/306] [PATCH] uml: fix build warnings in __get_user Fix a gcc warning about losing qualifiers to the first argument of copy_from_user. The typeof change for correctness, and fixes a lot of the warnings, but there are some cases where x has some extra qualifiers, like volatile, which copy_from_user can't know about. For these, the void * cast seems to be necessary. Also cleaned up some of the whitespace and got rid of the emacs comment at the bottom. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-um/uaccess.h | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/include/asm-um/uaccess.h b/include/asm-um/uaccess.h index 2ee028b8de9d..4e460d6f5ac8 100644 --- a/include/asm-um/uaccess.h +++ b/include/asm-um/uaccess.h @@ -41,16 +41,16 @@ #define __get_user(x, ptr) \ ({ \ - const __typeof__(ptr) __private_ptr = ptr; \ - __typeof__(*(__private_ptr)) __private_val; \ - int __private_ret = -EFAULT; \ - (x) = (__typeof__(*(__private_ptr)))0; \ - if (__copy_from_user(&__private_val, (__private_ptr), \ - sizeof(*(__private_ptr))) == 0) {\ - (x) = (__typeof__(*(__private_ptr))) __private_val; \ - __private_ret = 0; \ - } \ - __private_ret; \ + const __typeof__(ptr) __private_ptr = ptr; \ + __typeof__(x) __private_val; \ + int __private_ret = -EFAULT; \ + (x) = (__typeof__(*(__private_ptr)))0; \ + if (__copy_from_user((void *) &__private_val, (__private_ptr), \ + sizeof(*(__private_ptr))) == 0) { \ + (x) = (__typeof__(*(__private_ptr))) __private_val; \ + __private_ret = 0; \ + } \ + __private_ret; \ }) #define get_user(x, ptr) \ @@ -89,14 +89,3 @@ struct exception_table_entry }; #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ From c90e12b86595dbd9996336ac9d762487c638d934 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:29 -0800 Subject: [PATCH 031/306] [PATCH] uml: fix declaration of exit() This fixes a conflict between a header and what gcc "knows" the declaration' to be. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/kern.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/include/kern.h b/arch/um/include/kern.h index 7d223beccbc0..4ce3fc650e57 100644 --- a/arch/um/include/kern.h +++ b/arch/um/include/kern.h @@ -29,7 +29,7 @@ extern int getuid(void); extern int getgid(void); extern int pause(void); extern int write(int, const void *, int); -extern int exit(int); +extern void exit(int); extern int close(int); extern int read(unsigned int, char *, int); extern int pipe(int *); From d9f8b62a6b033fe7226d7c2b2efdd164ca1aa07c Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:29 -0800 Subject: [PATCH 032/306] [PATCH] uml: fix some printf formats Some printf formats are incorrect for large memory sizes. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/um_arch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 80c9c18aae94..0c5b9dcfbe2e 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -421,7 +421,7 @@ int linux_main(int argc, char **argv) #ifndef CONFIG_HIGHMEM highmem = 0; printf("CONFIG_HIGHMEM not enabled - physical memory shrunk " - "to %lu bytes\n", physmem_size); + "to %Lu bytes\n", physmem_size); #endif } @@ -433,8 +433,8 @@ int linux_main(int argc, char **argv) setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); if(init_maps(physmem_size, iomem_size, highmem)){ - printf("Failed to allocate mem_map for %lu bytes of physical " - "memory and %lu bytes of highmem\n", physmem_size, + printf("Failed to allocate mem_map for %Lu bytes of physical " + "memory and %Lu bytes of highmem\n", physmem_size, highmem); exit(1); } From 63ae2a94d98dd9d94163918539ec80df33f44a69 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:30 -0800 Subject: [PATCH 033/306] [PATCH] uml: move libc-dependent irq code to os-Linux The serial UML OS-abstraction layer patch (um/kernel dir). This moves all systemcalls from irq_user.c file under os-Linux dir Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/irq_user.h | 15 ++- arch/um/include/misc_constants.h | 6 + arch/um/include/os.h | 16 +++ arch/um/kernel/irq_user.c | 208 +++++++++--------------------- arch/um/kernel/smp.c | 14 +- arch/um/os-Linux/Makefile | 6 +- arch/um/os-Linux/irq.c | 162 +++++++++++++++++++++++ arch/um/os-Linux/tt.c | 10 ++ arch/um/sys-i386/user-offsets.c | 10 +- arch/um/sys-x86_64/user-offsets.c | 6 + 10 files changed, 290 insertions(+), 163 deletions(-) create mode 100644 arch/um/include/misc_constants.h create mode 100644 arch/um/os-Linux/irq.c diff --git a/arch/um/include/irq_user.h b/arch/um/include/irq_user.h index b61deb8b362a..69a93c804f0e 100644 --- a/arch/um/include/irq_user.h +++ b/arch/um/include/irq_user.h @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -6,6 +6,17 @@ #ifndef __IRQ_USER_H__ #define __IRQ_USER_H__ +struct irq_fd { + struct irq_fd *next; + void *id; + int fd; + int type; + int irq; + int pid; + int events; + int current_events; +}; + enum { IRQ_READ, IRQ_WRITE }; extern void sigio_handler(int sig, union uml_pt_regs *regs); @@ -16,8 +27,6 @@ extern void reactivate_fd(int fd, int irqnum); extern void deactivate_fd(int fd, int irqnum); extern int deactivate_all_fds(void); extern void forward_interrupts(int pid); -extern void init_irq_signals(int on_sigstack); -extern void forward_ipi(int fd, int pid); extern int activate_ipi(int fd, int pid); extern unsigned long irq_lock(void); extern void irq_unlock(unsigned long flags); diff --git a/arch/um/include/misc_constants.h b/arch/um/include/misc_constants.h new file mode 100644 index 000000000000..989bc08de36e --- /dev/null +++ b/arch/um/include/misc_constants.h @@ -0,0 +1,6 @@ +#ifndef __MISC_CONSTANT_H_ +#define __MISC_CONSTANT_H_ + +#include + +#endif diff --git a/arch/um/include/os.h b/arch/um/include/os.h index 2a1c64d8d0bf..efae3e6acb5b 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -12,6 +12,7 @@ #include "sysdep/ptrace.h" #include "kern_util.h" #include "skas/mm_id.h" +#include "irq_user.h" #define OS_TYPE_FILE 1 #define OS_TYPE_DIR 2 @@ -198,6 +199,8 @@ extern void os_flush_stdout(void); /* tt.c * for tt mode only (will be deleted in future...) */ +extern void forward_ipi(int fd, int pid); +extern void kill_child_dead(int pid); extern void stop(void); extern int wait_for_stop(int pid, int sig, int cont_type, void *relay); extern int protect_memory(unsigned long addr, unsigned long len, @@ -294,4 +297,17 @@ extern void initial_thread_cb_skas(void (*proc)(void *), extern void halt_skas(void); extern void reboot_skas(void); +/* irq.c */ +extern int os_waiting_for_events(struct irq_fd *active_fds); +extern int os_isatty(int fd); +extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds); +extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, + struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2); +extern void os_free_irq_later(struct irq_fd *active_fds, + int irq, void *dev_id); +extern int os_get_pollfd(int i); +extern void os_set_pollfd(int i, int fd); +extern void os_set_ioignore(void); +extern void init_irq_signals(int on_sigstack); + #endif diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c index 0e32f5f4a887..483e79c5ab98 100644 --- a/arch/um/kernel/irq_user.c +++ b/arch/um/kernel/irq_user.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -18,53 +18,25 @@ #include "sigio.h" #include "irq_user.h" #include "os.h" +#include "misc_constants.h" -struct irq_fd { - struct irq_fd *next; - void *id; - int fd; - int type; - int irq; - int pid; - int events; - int current_events; -}; - -static struct irq_fd *active_fds = NULL; +struct irq_fd *active_fds = NULL; static struct irq_fd **last_irq_ptr = &active_fds; -static struct pollfd *pollfds = NULL; -static int pollfds_num = 0; -static int pollfds_size = 0; - -extern int io_count, intr_count; - extern void free_irqs(void); void sigio_handler(int sig, union uml_pt_regs *regs) { struct irq_fd *irq_fd; - int i, n; + int n; if(smp_sigio_handler()) return; while(1){ - n = poll(pollfds, pollfds_num, 0); - if(n < 0){ - if(errno == EINTR) continue; - printk("sigio_handler : poll returned %d, " - "errno = %d\n", n, errno); - break; - } - if(n == 0) break; - - irq_fd = active_fds; - for(i = 0; i < pollfds_num; i++){ - if(pollfds[i].revents != 0){ - irq_fd->current_events = pollfds[i].revents; - pollfds[i].fd = -1; - } - irq_fd = irq_fd->next; - } + n = os_waiting_for_events(active_fds); + if (n <= 0) { + if(n == -EINTR) continue; + else break; + } for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ if(irq_fd->current_events != 0){ @@ -77,14 +49,9 @@ void sigio_handler(int sig, union uml_pt_regs *regs) free_irqs(); } -int activate_ipi(int fd, int pid) -{ - return(os_set_fd_async(fd, pid)); -} - static void maybe_sigio_broken(int fd, int type) { - if(isatty(fd)){ + if(os_isatty(fd)){ if((type == IRQ_WRITE) && !pty_output_sigio){ write_sigio_workaround(); add_sigio_fd(fd, 0); @@ -96,12 +63,13 @@ static void maybe_sigio_broken(int fd, int type) } } + int activate_fd(int irq, int fd, int type, void *dev_id) { struct pollfd *tmp_pfd; struct irq_fd *new_fd, *irq_fd; unsigned long flags; - int pid, events, err, n, size; + int pid, events, err, n; pid = os_getpid(); err = os_set_fd_async(fd, pid); @@ -113,8 +81,8 @@ int activate_fd(int irq, int fd, int type, void *dev_id) if(new_fd == NULL) goto out; - if(type == IRQ_READ) events = POLLIN | POLLPRI; - else events = POLLOUT; + if(type == IRQ_READ) events = UM_POLLIN | UM_POLLPRI; + else events = UM_POLLOUT; *new_fd = ((struct irq_fd) { .next = NULL, .id = dev_id, .fd = fd, @@ -125,12 +93,12 @@ int activate_fd(int irq, int fd, int type, void *dev_id) .current_events = 0 } ); /* Critical section - locked by a spinlock because this stuff can - * be changed from interrupt handlers. The stuff above is done + * be changed from interrupt handlers. The stuff above is done * outside the lock because it allocates memory. */ /* Actually, it only looks like it can be called from interrupt - * context. The culprit is reactivate_fd, which calls + * context. The culprit is reactivate_fd, which calls * maybe_sigio_broken, which calls write_sigio_workaround, * which calls activate_fd. However, write_sigio_workaround should * only be called once, at boot time. That would make it clear that @@ -147,40 +115,42 @@ int activate_fd(int irq, int fd, int type, void *dev_id) } } - n = pollfds_num; - if(n == pollfds_size){ - while(1){ - /* Here we have to drop the lock in order to call - * kmalloc, which might sleep. If something else - * came in and changed the pollfds array, we free - * the buffer and try again. - */ - irq_unlock(flags); - size = (pollfds_num + 1) * sizeof(pollfds[0]); - tmp_pfd = um_kmalloc(size); - flags = irq_lock(); - if(tmp_pfd == NULL) - goto out_unlock; - if(n == pollfds_size) - break; - kfree(tmp_pfd); - } - if(pollfds != NULL){ - memcpy(tmp_pfd, pollfds, - sizeof(pollfds[0]) * pollfds_size); - kfree(pollfds); - } - pollfds = tmp_pfd; - pollfds_size++; - } - - if(type == IRQ_WRITE) + /*-------------*/ + if(type == IRQ_WRITE) fd = -1; - pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, - .events = events, - .revents = 0 }); - pollfds_num++; + tmp_pfd = NULL; + n = 0; + + while(1){ + n = os_create_pollfd(fd, events, tmp_pfd, n); + if (n == 0) + break; + + /* n > 0 + * It means we couldn't put new pollfd to current pollfds + * and tmp_fds is NULL or too small for new pollfds array. + * Needed size is equal to n as minimum. + * + * Here we have to drop the lock in order to call + * kmalloc, which might sleep. + * If something else came in and changed the pollfds array + * so we will not be able to put new pollfd struct to pollfds + * then we free the buffer tmp_fds and try again. + */ + irq_unlock(flags); + if (tmp_pfd != NULL) { + kfree(tmp_pfd); + tmp_pfd = NULL; + } + + tmp_pfd = um_kmalloc(n); + if (tmp_pfd == NULL) + goto out_kfree; + + flags = irq_lock(); + } + /*-------------*/ *last_irq_ptr = new_fd; last_irq_ptr = &new_fd->next; @@ -196,6 +166,7 @@ int activate_fd(int irq, int fd, int type, void *dev_id) out_unlock: irq_unlock(flags); + out_kfree: kfree(new_fd); out: return(err); @@ -203,43 +174,10 @@ int activate_fd(int irq, int fd, int type, void *dev_id) static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) { - struct irq_fd **prev; unsigned long flags; - int i = 0; flags = irq_lock(); - prev = &active_fds; - while(*prev != NULL){ - if((*test)(*prev, arg)){ - struct irq_fd *old_fd = *prev; - if((pollfds[i].fd != -1) && - (pollfds[i].fd != (*prev)->fd)){ - printk("free_irq_by_cb - mismatch between " - "active_fds and pollfds, fd %d vs %d\n", - (*prev)->fd, pollfds[i].fd); - goto out; - } - - pollfds_num--; - - /* This moves the *whole* array after pollfds[i] (though - * it doesn't spot as such)! */ - - memmove(&pollfds[i], &pollfds[i + 1], - (pollfds_num - i) * sizeof(pollfds[0])); - - if(last_irq_ptr == &old_fd->next) - last_irq_ptr = prev; - *prev = (*prev)->next; - if(old_fd->type == IRQ_WRITE) - ignore_sigio_fd(old_fd->fd); - kfree(old_fd); - continue; - } - prev = &(*prev)->next; - i++; - } - out: + os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); irq_unlock(flags); } @@ -277,6 +215,7 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) { struct irq_fd *irq; int i = 0; + int fdi; for(irq=active_fds; irq != NULL; irq = irq->next){ if((irq->fd == fd) && (irq->irq == irqnum)) break; @@ -286,10 +225,11 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) printk("find_irq_by_fd doesn't have descriptor %d\n", fd); goto out; } - if((pollfds[i].fd != -1) && (pollfds[i].fd != fd)){ + fdi = os_get_pollfd(i); + if((fdi != -1) && (fdi != fd)){ printk("find_irq_by_fd - mismatch between active_fds and " - "pollfds, fd %d vs %d, need %d\n", irq->fd, - pollfds[i].fd, fd); + "pollfds, fd %d vs %d, need %d\n", irq->fd, + fdi, fd); irq = NULL; goto out; } @@ -310,9 +250,7 @@ void reactivate_fd(int fd, int irqnum) irq_unlock(flags); return; } - - pollfds[i].fd = irq->fd; - + os_set_pollfd(i, irq->fd); irq_unlock(flags); /* This calls activate_fd, so it has to be outside the critical @@ -331,7 +269,7 @@ void deactivate_fd(int fd, int irqnum) irq = find_irq_by_fd(fd, irqnum, &i); if(irq == NULL) goto out; - pollfds[i].fd = -1; + os_set_pollfd(i, -1); out: irq_unlock(flags); } @@ -347,21 +285,11 @@ int deactivate_all_fds(void) return(err); } /* If there is a signal already queued, after unblocking ignore it */ - set_handler(SIGIO, SIG_IGN, 0, -1); + os_set_ioignore(); return(0); } -void forward_ipi(int fd, int pid) -{ - int err; - - err = os_set_owner(fd, pid); - if(err < 0) - printk("forward_ipi: set_owner failed, fd = %d, me = %d, " - "target = %d, err = %d\n", fd, os_getpid(), pid, -err); -} - void forward_interrupts(int pid) { struct irq_fd *irq; @@ -384,22 +312,6 @@ void forward_interrupts(int pid) irq_unlock(flags); } -void init_irq_signals(int on_sigstack) -{ - __sighandler_t h; - int flags; - - flags = on_sigstack ? SA_ONSTACK : 0; - if(timer_irq_inited) h = (__sighandler_t) alarm_handler; - else h = boot_timer_handler; - - set_handler(SIGVTALRM, h, flags | SA_RESTART, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, -1); - set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - signal(SIGWINCH, SIG_IGN); -} - /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 72113b0a96e7..c8d8d0ac1a7f 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) * Licensed under the GPL */ @@ -77,9 +77,9 @@ static int idle_proc(void *cpup) if(err < 0) panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err); - activate_ipi(cpu_data[cpu].ipi_pipe[0], + os_set_fd_async(cpu_data[cpu].ipi_pipe[0], current->thread.mode.tt.extern_pid); - + wmb(); if (cpu_test_and_set(cpu, cpu_callin_map)) { printk("huh, CPU#%d already present??\n", cpu); @@ -106,7 +106,7 @@ static struct task_struct *idle_thread(int cpu) panic("copy_process failed in idle_thread, error = %ld", PTR_ERR(new_task)); - cpu_tasks[cpu] = ((struct cpu_task) + cpu_tasks[cpu] = ((struct cpu_task) { .pid = new_task->thread.mode.tt.extern_pid, .task = new_task } ); idle_threads[cpu] = new_task; @@ -134,12 +134,12 @@ void smp_prepare_cpus(unsigned int maxcpus) if(err < 0) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); - activate_ipi(cpu_data[me].ipi_pipe[0], + os_set_fd_async(cpu_data[me].ipi_pipe[0], current->thread.mode.tt.extern_pid); for(cpu = 1; cpu < ncpus; cpu++){ printk("Booting processor %d...\n", cpu); - + idle = idle_thread(cpu); init_idle(idle, cpu); @@ -223,7 +223,7 @@ void smp_call_function_slave(int cpu) atomic_inc(&scf_finished); } -int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, +int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, int wait) { int cpus = num_online_cpus() - 1; diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 08a4e628b24c..73d3d837ed6b 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -3,14 +3,14 @@ # Licensed under the GPL # -obj-y = aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ +obj-y = aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o signal.o \ start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o user_syms.o \ util.o drivers/ sys-$(SUBARCH)/ obj-$(CONFIG_MODE_SKAS) += skas/ -USER_OBJS := aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ - start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o util.o +USER_OBJS := aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o \ + signal.o start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o util.o elf_aux.o: $(ARCH_DIR)/kernel-offsets.h CFLAGS_elf_aux.o += -I$(objtree)/arch/um diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c new file mode 100644 index 000000000000..e599be423da1 --- /dev/null +++ b/arch/um/os-Linux/irq.c @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "process.h" +#include "sigio.h" +#include "irq_user.h" +#include "os.h" + +static struct pollfd *pollfds = NULL; +static int pollfds_num = 0; +static int pollfds_size = 0; + +int os_waiting_for_events(struct irq_fd *active_fds) +{ + struct irq_fd *irq_fd; + int i, n, err; + + n = poll(pollfds, pollfds_num, 0); + if(n < 0){ + err = -errno; + if(errno != EINTR) + printk("sigio_handler: os_waiting_for_events:" + " poll returned %d, errno = %d\n", n, errno); + return err; + } + + if(n == 0) + return 0; + + irq_fd = active_fds; + + for(i = 0; i < pollfds_num; i++){ + if(pollfds[i].revents != 0){ + irq_fd->current_events = pollfds[i].revents; + pollfds[i].fd = -1; + } + irq_fd = irq_fd->next; + } + return n; +} + +int os_isatty(int fd) +{ + return(isatty(fd)); +} + +int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) +{ + if (pollfds_num == pollfds_size) { + if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) { + /* return min size needed for new pollfds area */ + return((pollfds_size + 1) * sizeof(pollfds[0])); + } + + if(pollfds != NULL){ + memcpy(tmp_pfd, pollfds, + sizeof(pollfds[0]) * pollfds_size); + /* remove old pollfds */ + kfree(pollfds); + } + pollfds = tmp_pfd; + pollfds_size++; + } else { + /* remove not used tmp_pfd */ + if (tmp_pfd != NULL) + kfree(tmp_pfd); + } + + pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, + .events = events, + .revents = 0 }); + pollfds_num++; + + return(0); +} + +void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, + struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2) +{ + struct irq_fd **prev; + int i = 0; + + prev = &active_fds; + while(*prev != NULL){ + if((*test)(*prev, arg)){ + struct irq_fd *old_fd = *prev; + if((pollfds[i].fd != -1) && + (pollfds[i].fd != (*prev)->fd)){ + printk("os_free_irq_by_cb - mismatch between " + "active_fds and pollfds, fd %d vs %d\n", + (*prev)->fd, pollfds[i].fd); + goto out; + } + + pollfds_num--; + + /* This moves the *whole* array after pollfds[i] + * (though it doesn't spot as such)! + */ + + memmove(&pollfds[i], &pollfds[i + 1], + (pollfds_num - i) * sizeof(pollfds[0])); + if(*last_irq_ptr2 == &old_fd->next) + *last_irq_ptr2 = prev; + + *prev = (*prev)->next; + if(old_fd->type == IRQ_WRITE) + ignore_sigio_fd(old_fd->fd); + kfree(old_fd); + continue; + } + prev = &(*prev)->next; + i++; + } + out: + return; +} + + +int os_get_pollfd(int i) +{ + return(pollfds[i].fd); +} + +void os_set_pollfd(int i, int fd) +{ + pollfds[i].fd = fd; +} + +void os_set_ioignore(void) +{ + set_handler(SIGIO, SIG_IGN, 0, -1); +} + +void init_irq_signals(int on_sigstack) +{ + __sighandler_t h; + int flags; + + flags = on_sigstack ? SA_ONSTACK : 0; + if(timer_irq_inited) h = (__sighandler_t) alarm_handler; + else h = boot_timer_handler; + + set_handler(SIGVTALRM, h, flags | SA_RESTART, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, -1); + set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + signal(SIGWINCH, SIG_IGN); +} diff --git a/arch/um/os-Linux/tt.c b/arch/um/os-Linux/tt.c index 919d19f11537..5461a065bbb9 100644 --- a/arch/um/os-Linux/tt.c +++ b/arch/um/os-Linux/tt.c @@ -110,6 +110,16 @@ int wait_for_stop(int pid, int sig, int cont_type, void *relay) } } +void forward_ipi(int fd, int pid) +{ + int err; + + err = os_set_owner(fd, pid); + if(err < 0) + printk("forward_ipi: set_owner failed, fd = %d, me = %d, " + "target = %d, err = %d\n", fd, os_getpid(), pid, -err); +} + /* *------------------------- * only for tt mode (will be deleted in future...) diff --git a/arch/um/sys-i386/user-offsets.c b/arch/um/sys-i386/user-offsets.c index 26b68675053d..6f4ef2b7fa4a 100644 --- a/arch/um/sys-i386/user-offsets.c +++ b/arch/um/sys-i386/user-offsets.c @@ -3,12 +3,13 @@ #include #include #include +#include #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) #define DEFINE_LONGS(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long))) + asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long))) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)); @@ -67,4 +68,9 @@ void foo(void) DEFINE(HOST_ES, ES); DEFINE(HOST_GS, GS); DEFINE(UM_FRAME_SIZE, sizeof(struct user_regs_struct)); + + /* XXX Duplicated between i386 and x86_64 */ + DEFINE(UM_POLLIN, POLLIN); + DEFINE(UM_POLLPRI, POLLPRI); + DEFINE(UM_POLLOUT, POLLOUT); } diff --git a/arch/um/sys-x86_64/user-offsets.c b/arch/um/sys-x86_64/user-offsets.c index 7bd54a921cf7..899cebb57c3f 100644 --- a/arch/um/sys-x86_64/user-offsets.c +++ b/arch/um/sys-x86_64/user-offsets.c @@ -1,6 +1,7 @@ #include #include #include +#include #define __FRAME_OFFSETS #include #include @@ -88,4 +89,9 @@ void foo(void) DEFINE_LONGS(HOST_IP, RIP); DEFINE_LONGS(HOST_SP, RSP); DEFINE(UM_FRAME_SIZE, sizeof(struct user_regs_struct)); + + /* XXX Duplicated between i386 and x86_64 */ + DEFINE(UM_POLLIN, POLLIN); + DEFINE(UM_POLLPRI, POLLPRI); + DEFINE(UM_POLLOUT, POLLOUT); } From 9b4f018d92dbeff9305b4cf4aa80874c3974cfcc Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:31 -0800 Subject: [PATCH 034/306] [PATCH] uml: merge irq_user.c and irq.c The serial UML OS-abstraction layer patch (um/kernel dir). This joins irq_user.c and irq.c files. Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/Makefile | 2 +- arch/um/kernel/irq.c | 294 ++++++++++++++++++++++++++++++++++ arch/um/kernel/irq_user.c | 324 -------------------------------------- 3 files changed, 295 insertions(+), 325 deletions(-) delete mode 100644 arch/um/kernel/irq_user.c diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 693018ba80f1..8458c30a6265 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := vmlinux.lds clean-files := obj-y = config.o exec_kern.o exitcode.o \ - init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \ + init_task.o irq.o ksyms.o mem.o physmem.o \ process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \ signal_kern.o smp.o syscall_kern.o sysrq.o \ time_kern.o tlb.o trap_kern.o uaccess.o um_arch.o umid.o diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index bbf94bf2921e..c39ea3abeda4 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -31,6 +31,8 @@ #include "irq_user.h" #include "irq_kern.h" #include "os.h" +#include "sigio.h" +#include "misc_constants.h" /* * Generic, controller-independent functions: @@ -77,6 +79,298 @@ skip: return 0; } +struct irq_fd *active_fds = NULL; +static struct irq_fd **last_irq_ptr = &active_fds; + +extern void free_irqs(void); + +void sigio_handler(int sig, union uml_pt_regs *regs) +{ + struct irq_fd *irq_fd; + int n; + + if(smp_sigio_handler()) return; + while(1){ + n = os_waiting_for_events(active_fds); + if (n <= 0) { + if(n == -EINTR) continue; + else break; + } + + for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ + if(irq_fd->current_events != 0){ + irq_fd->current_events = 0; + do_IRQ(irq_fd->irq, regs); + } + } + } + + free_irqs(); +} + +static void maybe_sigio_broken(int fd, int type) +{ + if(os_isatty(fd)){ + if((type == IRQ_WRITE) && !pty_output_sigio){ + write_sigio_workaround(); + add_sigio_fd(fd, 0); + } + else if((type == IRQ_READ) && !pty_close_sigio){ + write_sigio_workaround(); + add_sigio_fd(fd, 1); + } + } +} + + +int activate_fd(int irq, int fd, int type, void *dev_id) +{ + struct pollfd *tmp_pfd; + struct irq_fd *new_fd, *irq_fd; + unsigned long flags; + int pid, events, err, n; + + pid = os_getpid(); + err = os_set_fd_async(fd, pid); + if(err < 0) + goto out; + + new_fd = um_kmalloc(sizeof(*new_fd)); + err = -ENOMEM; + if(new_fd == NULL) + goto out; + + if(type == IRQ_READ) events = UM_POLLIN | UM_POLLPRI; + else events = UM_POLLOUT; + *new_fd = ((struct irq_fd) { .next = NULL, + .id = dev_id, + .fd = fd, + .type = type, + .irq = irq, + .pid = pid, + .events = events, + .current_events = 0 } ); + + /* Critical section - locked by a spinlock because this stuff can + * be changed from interrupt handlers. The stuff above is done + * outside the lock because it allocates memory. + */ + + /* Actually, it only looks like it can be called from interrupt + * context. The culprit is reactivate_fd, which calls + * maybe_sigio_broken, which calls write_sigio_workaround, + * which calls activate_fd. However, write_sigio_workaround should + * only be called once, at boot time. That would make it clear that + * this is called only from process context, and can be locked with + * a semaphore. + */ + flags = irq_lock(); + for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ + if((irq_fd->fd == fd) && (irq_fd->type == type)){ + printk("Registering fd %d twice\n", fd); + printk("Irqs : %d, %d\n", irq_fd->irq, irq); + printk("Ids : 0x%p, 0x%p\n", irq_fd->id, dev_id); + goto out_unlock; + } + } + + /*-------------*/ + if(type == IRQ_WRITE) + fd = -1; + + tmp_pfd = NULL; + n = 0; + + while(1){ + n = os_create_pollfd(fd, events, tmp_pfd, n); + if (n == 0) + break; + + /* n > 0 + * It means we couldn't put new pollfd to current pollfds + * and tmp_fds is NULL or too small for new pollfds array. + * Needed size is equal to n as minimum. + * + * Here we have to drop the lock in order to call + * kmalloc, which might sleep. + * If something else came in and changed the pollfds array + * so we will not be able to put new pollfd struct to pollfds + * then we free the buffer tmp_fds and try again. + */ + irq_unlock(flags); + if (tmp_pfd != NULL) { + kfree(tmp_pfd); + tmp_pfd = NULL; + } + + tmp_pfd = um_kmalloc(n); + if (tmp_pfd == NULL) + goto out_kfree; + + flags = irq_lock(); + } + /*-------------*/ + + *last_irq_ptr = new_fd; + last_irq_ptr = &new_fd->next; + + irq_unlock(flags); + + /* This calls activate_fd, so it has to be outside the critical + * section. + */ + maybe_sigio_broken(fd, type); + + return(0); + + out_unlock: + irq_unlock(flags); + out_kfree: + kfree(new_fd); + out: + return(err); +} + +static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) +{ + unsigned long flags; + + flags = irq_lock(); + os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); + irq_unlock(flags); +} + +struct irq_and_dev { + int irq; + void *dev; +}; + +static int same_irq_and_dev(struct irq_fd *irq, void *d) +{ + struct irq_and_dev *data = d; + + return((irq->irq == data->irq) && (irq->id == data->dev)); +} + +void free_irq_by_irq_and_dev(unsigned int irq, void *dev) +{ + struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, + .dev = dev }); + + free_irq_by_cb(same_irq_and_dev, &data); +} + +static int same_fd(struct irq_fd *irq, void *fd) +{ + return(irq->fd == *((int *) fd)); +} + +void free_irq_by_fd(int fd) +{ + free_irq_by_cb(same_fd, &fd); +} + +static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) +{ + struct irq_fd *irq; + int i = 0; + int fdi; + + for(irq=active_fds; irq != NULL; irq = irq->next){ + if((irq->fd == fd) && (irq->irq == irqnum)) break; + i++; + } + if(irq == NULL){ + printk("find_irq_by_fd doesn't have descriptor %d\n", fd); + goto out; + } + fdi = os_get_pollfd(i); + if((fdi != -1) && (fdi != fd)){ + printk("find_irq_by_fd - mismatch between active_fds and " + "pollfds, fd %d vs %d, need %d\n", irq->fd, + fdi, fd); + irq = NULL; + goto out; + } + *index_out = i; + out: + return(irq); +} + +void reactivate_fd(int fd, int irqnum) +{ + struct irq_fd *irq; + unsigned long flags; + int i; + + flags = irq_lock(); + irq = find_irq_by_fd(fd, irqnum, &i); + if(irq == NULL){ + irq_unlock(flags); + return; + } + os_set_pollfd(i, irq->fd); + irq_unlock(flags); + + /* This calls activate_fd, so it has to be outside the critical + * section. + */ + maybe_sigio_broken(fd, irq->type); +} + +void deactivate_fd(int fd, int irqnum) +{ + struct irq_fd *irq; + unsigned long flags; + int i; + + flags = irq_lock(); + irq = find_irq_by_fd(fd, irqnum, &i); + if(irq == NULL) + goto out; + os_set_pollfd(i, -1); + out: + irq_unlock(flags); +} + +int deactivate_all_fds(void) +{ + struct irq_fd *irq; + int err; + + for(irq=active_fds;irq != NULL;irq = irq->next){ + err = os_clear_fd_async(irq->fd); + if(err) + return(err); + } + /* If there is a signal already queued, after unblocking ignore it */ + os_set_ioignore(); + + return(0); +} + +void forward_interrupts(int pid) +{ + struct irq_fd *irq; + unsigned long flags; + int err; + + flags = irq_lock(); + for(irq=active_fds;irq != NULL;irq = irq->next){ + err = os_set_owner(irq->fd, pid); + if(err < 0){ + /* XXX Just remove the irq rather than + * print out an infinite stream of these + */ + printk("Failed to forward %d to pid %d, err = %d\n", + irq->fd, pid, -err); + } + + irq->pid = pid; + } + irq_unlock(flags); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c deleted file mode 100644 index 483e79c5ab98..000000000000 --- a/arch/um/kernel/irq_user.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "user_util.h" -#include "kern_util.h" -#include "user.h" -#include "process.h" -#include "sigio.h" -#include "irq_user.h" -#include "os.h" -#include "misc_constants.h" - -struct irq_fd *active_fds = NULL; -static struct irq_fd **last_irq_ptr = &active_fds; - -extern void free_irqs(void); - -void sigio_handler(int sig, union uml_pt_regs *regs) -{ - struct irq_fd *irq_fd; - int n; - - if(smp_sigio_handler()) return; - while(1){ - n = os_waiting_for_events(active_fds); - if (n <= 0) { - if(n == -EINTR) continue; - else break; - } - - for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ - if(irq_fd->current_events != 0){ - irq_fd->current_events = 0; - do_IRQ(irq_fd->irq, regs); - } - } - } - - free_irqs(); -} - -static void maybe_sigio_broken(int fd, int type) -{ - if(os_isatty(fd)){ - if((type == IRQ_WRITE) && !pty_output_sigio){ - write_sigio_workaround(); - add_sigio_fd(fd, 0); - } - else if((type == IRQ_READ) && !pty_close_sigio){ - write_sigio_workaround(); - add_sigio_fd(fd, 1); - } - } -} - - -int activate_fd(int irq, int fd, int type, void *dev_id) -{ - struct pollfd *tmp_pfd; - struct irq_fd *new_fd, *irq_fd; - unsigned long flags; - int pid, events, err, n; - - pid = os_getpid(); - err = os_set_fd_async(fd, pid); - if(err < 0) - goto out; - - new_fd = um_kmalloc(sizeof(*new_fd)); - err = -ENOMEM; - if(new_fd == NULL) - goto out; - - if(type == IRQ_READ) events = UM_POLLIN | UM_POLLPRI; - else events = UM_POLLOUT; - *new_fd = ((struct irq_fd) { .next = NULL, - .id = dev_id, - .fd = fd, - .type = type, - .irq = irq, - .pid = pid, - .events = events, - .current_events = 0 } ); - - /* Critical section - locked by a spinlock because this stuff can - * be changed from interrupt handlers. The stuff above is done - * outside the lock because it allocates memory. - */ - - /* Actually, it only looks like it can be called from interrupt - * context. The culprit is reactivate_fd, which calls - * maybe_sigio_broken, which calls write_sigio_workaround, - * which calls activate_fd. However, write_sigio_workaround should - * only be called once, at boot time. That would make it clear that - * this is called only from process context, and can be locked with - * a semaphore. - */ - flags = irq_lock(); - for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ - if((irq_fd->fd == fd) && (irq_fd->type == type)){ - printk("Registering fd %d twice\n", fd); - printk("Irqs : %d, %d\n", irq_fd->irq, irq); - printk("Ids : 0x%x, 0x%x\n", irq_fd->id, dev_id); - goto out_unlock; - } - } - - /*-------------*/ - if(type == IRQ_WRITE) - fd = -1; - - tmp_pfd = NULL; - n = 0; - - while(1){ - n = os_create_pollfd(fd, events, tmp_pfd, n); - if (n == 0) - break; - - /* n > 0 - * It means we couldn't put new pollfd to current pollfds - * and tmp_fds is NULL or too small for new pollfds array. - * Needed size is equal to n as minimum. - * - * Here we have to drop the lock in order to call - * kmalloc, which might sleep. - * If something else came in and changed the pollfds array - * so we will not be able to put new pollfd struct to pollfds - * then we free the buffer tmp_fds and try again. - */ - irq_unlock(flags); - if (tmp_pfd != NULL) { - kfree(tmp_pfd); - tmp_pfd = NULL; - } - - tmp_pfd = um_kmalloc(n); - if (tmp_pfd == NULL) - goto out_kfree; - - flags = irq_lock(); - } - /*-------------*/ - - *last_irq_ptr = new_fd; - last_irq_ptr = &new_fd->next; - - irq_unlock(flags); - - /* This calls activate_fd, so it has to be outside the critical - * section. - */ - maybe_sigio_broken(fd, type); - - return(0); - - out_unlock: - irq_unlock(flags); - out_kfree: - kfree(new_fd); - out: - return(err); -} - -static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) -{ - unsigned long flags; - - flags = irq_lock(); - os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); - irq_unlock(flags); -} - -struct irq_and_dev { - int irq; - void *dev; -}; - -static int same_irq_and_dev(struct irq_fd *irq, void *d) -{ - struct irq_and_dev *data = d; - - return((irq->irq == data->irq) && (irq->id == data->dev)); -} - -void free_irq_by_irq_and_dev(unsigned int irq, void *dev) -{ - struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, - .dev = dev }); - - free_irq_by_cb(same_irq_and_dev, &data); -} - -static int same_fd(struct irq_fd *irq, void *fd) -{ - return(irq->fd == *((int *) fd)); -} - -void free_irq_by_fd(int fd) -{ - free_irq_by_cb(same_fd, &fd); -} - -static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) -{ - struct irq_fd *irq; - int i = 0; - int fdi; - - for(irq=active_fds; irq != NULL; irq = irq->next){ - if((irq->fd == fd) && (irq->irq == irqnum)) break; - i++; - } - if(irq == NULL){ - printk("find_irq_by_fd doesn't have descriptor %d\n", fd); - goto out; - } - fdi = os_get_pollfd(i); - if((fdi != -1) && (fdi != fd)){ - printk("find_irq_by_fd - mismatch between active_fds and " - "pollfds, fd %d vs %d, need %d\n", irq->fd, - fdi, fd); - irq = NULL; - goto out; - } - *index_out = i; - out: - return(irq); -} - -void reactivate_fd(int fd, int irqnum) -{ - struct irq_fd *irq; - unsigned long flags; - int i; - - flags = irq_lock(); - irq = find_irq_by_fd(fd, irqnum, &i); - if(irq == NULL){ - irq_unlock(flags); - return; - } - os_set_pollfd(i, irq->fd); - irq_unlock(flags); - - /* This calls activate_fd, so it has to be outside the critical - * section. - */ - maybe_sigio_broken(fd, irq->type); -} - -void deactivate_fd(int fd, int irqnum) -{ - struct irq_fd *irq; - unsigned long flags; - int i; - - flags = irq_lock(); - irq = find_irq_by_fd(fd, irqnum, &i); - if(irq == NULL) - goto out; - os_set_pollfd(i, -1); - out: - irq_unlock(flags); -} - -int deactivate_all_fds(void) -{ - struct irq_fd *irq; - int err; - - for(irq=active_fds;irq != NULL;irq = irq->next){ - err = os_clear_fd_async(irq->fd); - if(err) - return(err); - } - /* If there is a signal already queued, after unblocking ignore it */ - os_set_ioignore(); - - return(0); -} - -void forward_interrupts(int pid) -{ - struct irq_fd *irq; - unsigned long flags; - int err; - - flags = irq_lock(); - for(irq=active_fds;irq != NULL;irq = irq->next){ - err = os_set_owner(irq->fd, pid); - if(err < 0){ - /* XXX Just remove the irq rather than - * print out an infinite stream of these - */ - printk("Failed to forward %d to pid %d, err = %d\n", - irq->fd, pid, -err); - } - - irq->pid = pid; - } - irq_unlock(flags); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ From 8e367065eea04a6fde5cb8f3854164af99c45693 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:32 -0800 Subject: [PATCH 035/306] [PATCH] uml: move SIGIO startup code to os-Linux/start_up.c The serial UML OS-abstraction layer patch (um/kernel dir). This moves all startup code from sigio_user.c file under os-Linux dir Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/os.h | 1 + arch/um/include/sigio.h | 1 - arch/um/kernel/sigio_kern.c | 10 ++- arch/um/kernel/sigio_user.c | 162 +++--------------------------------- arch/um/kernel/um_arch.c | 3 +- arch/um/os-Linux/start_up.c | 128 ++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+), 157 deletions(-) diff --git a/arch/um/include/os.h b/arch/um/include/os.h index efae3e6acb5b..6d865d4eade0 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -161,6 +161,7 @@ extern int os_lock_file(int fd, int excl); /* start_up.c */ extern void os_early_checks(void); extern int can_do_skas(void); +extern void os_check_bugs(void); /* Make sure they are clear when running in TT mode. Required by * SEGV_MAYBE_FIXABLE */ diff --git a/arch/um/include/sigio.h b/arch/um/include/sigio.h index 37d76e29a147..1432fcfb3212 100644 --- a/arch/um/include/sigio.h +++ b/arch/um/include/sigio.h @@ -8,7 +8,6 @@ extern int write_sigio_irq(int fd); extern int register_sigio_fd(int fd); -extern int read_sigio_fd(int fd); extern int add_sigio_fd(int fd, int read); extern int ignore_sigio_fd(int fd); extern void sigio_lock(void); diff --git a/arch/um/kernel/sigio_kern.c b/arch/um/kernel/sigio_kern.c index 229988463c4c..1c1300fb1e95 100644 --- a/arch/um/kernel/sigio_kern.c +++ b/arch/um/kernel/sigio_kern.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com) * Licensed under the GPL */ @@ -12,13 +12,16 @@ #include "sigio.h" #include "irq_user.h" #include "irq_kern.h" +#include "os.h" /* Protected by sigio_lock() called from write_sigio_workaround */ static int sigio_irq_fd = -1; static irqreturn_t sigio_interrupt(int irq, void *data, struct pt_regs *unused) { - read_sigio_fd(sigio_irq_fd); + char c; + + os_read_file(sigio_irq_fd, &c, sizeof(c)); reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); return(IRQ_HANDLED); } @@ -51,6 +54,9 @@ void sigio_unlock(void) spin_unlock(&sigio_spinlock); } +extern void sigio_cleanup(void); +__uml_exitcall(sigio_cleanup); + /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically diff --git a/arch/um/kernel/sigio_user.c b/arch/um/kernel/sigio_user.c index f7b18e157d35..9bef9413cb5d 100644 --- a/arch/um/kernel/sigio_user.c +++ b/arch/um/kernel/sigio_user.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -20,127 +20,6 @@ #include "sigio.h" #include "os.h" -/* Changed during early boot */ -int pty_output_sigio = 0; -int pty_close_sigio = 0; - -/* Used as a flag during SIGIO testing early in boot */ -static volatile int got_sigio = 0; - -void __init handler(int sig) -{ - got_sigio = 1; -} - -struct openpty_arg { - int master; - int slave; - int err; -}; - -static void openpty_cb(void *arg) -{ - struct openpty_arg *info = arg; - - info->err = 0; - if(openpty(&info->master, &info->slave, NULL, NULL, NULL)) - info->err = -errno; -} - -void __init check_one_sigio(void (*proc)(int, int)) -{ - struct sigaction old, new; - struct openpty_arg pty = { .master = -1, .slave = -1 }; - int master, slave, err; - - initial_thread_cb(openpty_cb, &pty); - if(pty.err){ - printk("openpty failed, errno = %d\n", -pty.err); - return; - } - - master = pty.master; - slave = pty.slave; - - if((master == -1) || (slave == -1)){ - printk("openpty failed to allocate a pty\n"); - return; - } - - /* Not now, but complain so we now where we failed. */ - err = raw(master); - if (err < 0) - panic("check_sigio : __raw failed, errno = %d\n", -err); - - err = os_sigio_async(master, slave); - if(err < 0) - panic("tty_fds : sigio_async failed, err = %d\n", -err); - - if(sigaction(SIGIO, NULL, &old) < 0) - panic("check_sigio : sigaction 1 failed, errno = %d\n", errno); - new = old; - new.sa_handler = handler; - if(sigaction(SIGIO, &new, NULL) < 0) - panic("check_sigio : sigaction 2 failed, errno = %d\n", errno); - - got_sigio = 0; - (*proc)(master, slave); - - os_close_file(master); - os_close_file(slave); - - if(sigaction(SIGIO, &old, NULL) < 0) - panic("check_sigio : sigaction 3 failed, errno = %d\n", errno); -} - -static void tty_output(int master, int slave) -{ - int n; - char buf[512]; - - printk("Checking that host ptys support output SIGIO..."); - - memset(buf, 0, sizeof(buf)); - - while(os_write_file(master, buf, sizeof(buf)) > 0) ; - if(errno != EAGAIN) - panic("check_sigio : write failed, errno = %d\n", errno); - while(((n = os_read_file(slave, buf, sizeof(buf))) > 0) && !got_sigio) ; - - if (got_sigio) { - printk("Yes\n"); - pty_output_sigio = 1; - } else if (n == -EAGAIN) { - printk("No, enabling workaround\n"); - } else { - panic("check_sigio : read failed, err = %d\n", n); - } -} - -static void tty_close(int master, int slave) -{ - printk("Checking that host ptys support SIGIO on close..."); - - os_close_file(slave); - if(got_sigio){ - printk("Yes\n"); - pty_close_sigio = 1; - } - else printk("No, enabling workaround\n"); -} - -void __init check_sigio(void) -{ - if((os_access("/dev/ptmx", OS_ACC_R_OK) < 0) && - (os_access("/dev/ptyp0", OS_ACC_R_OK) < 0)){ - printk("No pseudo-terminals available - skipping pty SIGIO " - "check\n"); - return; - } - check_one_sigio(tty_output); - check_one_sigio(tty_close); -} - /* Protected by sigio_lock(), also used by sigio_cleanup, which is an * exitcall. */ @@ -267,10 +146,10 @@ static void update_thread(void) if(write_sigio_pid != -1) os_kill_process(write_sigio_pid, 1); write_sigio_pid = -1; - os_close_file(sigio_private[0]); - os_close_file(sigio_private[1]); - os_close_file(write_sigio_fds[0]); - os_close_file(write_sigio_fds[1]); + close(sigio_private[0]); + close(sigio_private[1]); + close(write_sigio_fds[0]); + close(write_sigio_fds[1]); /* Critical section end */ set_signals(flags); } @@ -428,39 +307,18 @@ void write_sigio_workaround(void) out_free: kfree(p); out_close2: - os_close_file(l_sigio_private[0]); - os_close_file(l_sigio_private[1]); + close(l_sigio_private[0]); + close(l_sigio_private[1]); out_close1: - os_close_file(l_write_sigio_fds[0]); - os_close_file(l_write_sigio_fds[1]); + close(l_write_sigio_fds[0]); + close(l_write_sigio_fds[1]); return; } -int read_sigio_fd(int fd) -{ - int n; - char c; - - n = os_read_file(fd, &c, sizeof(c)); - if(n != sizeof(c)){ - if(n < 0) { - printk("read_sigio_fd - read failed, err = %d\n", -n); - return(n); - } - else { - printk("read_sigio_fd - short read, bytes = %d\n", n); - return(-EIO); - } - } - return(n); -} - -static void sigio_cleanup(void) +void sigio_cleanup(void) { if (write_sigio_pid != -1) { os_kill_process(write_sigio_pid, 1); write_sigio_pid = -1; } } - -__uml_exitcall(sigio_cleanup); diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 0c5b9dcfbe2e..bb1c87211ac1 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -487,8 +487,7 @@ void __init setup_arch(char **cmdline_p) void __init check_bugs(void) { arch_check_bugs(); - check_sigio(); - check_devanon(); + os_check_bugs(); } void apply_alternatives(struct alt_instr *start, struct alt_instr *end) diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 829d6b0d8b02..32753131f8d8 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -3,6 +3,7 @@ * Licensed under the GPL */ +#include #include #include #include @@ -539,3 +540,130 @@ int __init parse_iomem(char *str, int *add) return(1); } + +/* Changed during early boot */ +int pty_output_sigio = 0; +int pty_close_sigio = 0; + +/* Used as a flag during SIGIO testing early in boot */ +static volatile int got_sigio = 0; + +static void __init handler(int sig) +{ + got_sigio = 1; +} + +struct openpty_arg { + int master; + int slave; + int err; +}; + +static void openpty_cb(void *arg) +{ + struct openpty_arg *info = arg; + + info->err = 0; + if(openpty(&info->master, &info->slave, NULL, NULL, NULL)) + info->err = -errno; +} + +static void __init check_one_sigio(void (*proc)(int, int)) +{ + struct sigaction old, new; + struct openpty_arg pty = { .master = -1, .slave = -1 }; + int master, slave, err; + + initial_thread_cb(openpty_cb, &pty); + if(pty.err){ + printk("openpty failed, errno = %d\n", -pty.err); + return; + } + + master = pty.master; + slave = pty.slave; + + if((master == -1) || (slave == -1)){ + printk("openpty failed to allocate a pty\n"); + return; + } + + /* Not now, but complain so we now where we failed. */ + err = raw(master); + if (err < 0) + panic("check_sigio : __raw failed, errno = %d\n", -err); + + err = os_sigio_async(master, slave); + if(err < 0) + panic("tty_fds : sigio_async failed, err = %d\n", -err); + + if(sigaction(SIGIO, NULL, &old) < 0) + panic("check_sigio : sigaction 1 failed, errno = %d\n", errno); + new = old; + new.sa_handler = handler; + if(sigaction(SIGIO, &new, NULL) < 0) + panic("check_sigio : sigaction 2 failed, errno = %d\n", errno); + + got_sigio = 0; + (*proc)(master, slave); + + close(master); + close(slave); + + if(sigaction(SIGIO, &old, NULL) < 0) + panic("check_sigio : sigaction 3 failed, errno = %d\n", errno); +} + +static void tty_output(int master, int slave) +{ + int n; + char buf[512]; + + printk("Checking that host ptys support output SIGIO..."); + + memset(buf, 0, sizeof(buf)); + + while(os_write_file(master, buf, sizeof(buf)) > 0) ; + if(errno != EAGAIN) + panic("check_sigio : write failed, errno = %d\n", errno); + while(((n = os_read_file(slave, buf, sizeof(buf))) > 0) && !got_sigio) ; + + if(got_sigio){ + printk("Yes\n"); + pty_output_sigio = 1; + } + else if(n == -EAGAIN) printk("No, enabling workaround\n"); + else panic("check_sigio : read failed, err = %d\n", n); +} + +static void tty_close(int master, int slave) +{ + printk("Checking that host ptys support SIGIO on close..."); + + close(slave); + if(got_sigio){ + printk("Yes\n"); + pty_close_sigio = 1; + } + else printk("No, enabling workaround\n"); +} + +void __init check_sigio(void) +{ + if((os_access("/dev/ptmx", OS_ACC_R_OK) < 0) && + (os_access("/dev/ptyp0", OS_ACC_R_OK) < 0)){ + printk("No pseudo-terminals available - skipping pty SIGIO " + "check\n"); + return; + } + check_one_sigio(tty_output); + check_one_sigio(tty_close); +} + +void os_check_bugs(void) +{ + check_ptrace(); + check_sigio(); + check_devanon(); +} + From f206aabb035318ac4bafbf0b87798335de3634df Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:33 -0800 Subject: [PATCH 036/306] [PATCH] uml: move sigio_user.c to os-Linux/sigio.c The serial UML OS-abstraction layer patch (um/kernel dir). This moves sigio_user.c to os-Linux dir Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/os.h | 5 +++++ arch/um/include/sigio.h | 2 -- arch/um/include/user_util.h | 1 - arch/um/kernel/Makefile | 2 +- arch/um/os-Linux/Makefile | 9 +++++---- .../um/{kernel/sigio_user.c => os-Linux/sigio.c} | 16 ++++++++-------- 6 files changed, 19 insertions(+), 16 deletions(-) rename arch/um/{kernel/sigio_user.c => os-Linux/sigio.c} (96%) diff --git a/arch/um/include/os.h b/arch/um/include/os.h index 6d865d4eade0..53dee8d3cdcc 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -311,4 +311,9 @@ extern void os_set_pollfd(int i, int fd); extern void os_set_ioignore(void); extern void init_irq_signals(int on_sigstack); +/* sigio.c */ +extern void write_sigio_workaround(void); +extern int add_sigio_fd(int fd, int read); +extern int ignore_sigio_fd(int fd); + #endif diff --git a/arch/um/include/sigio.h b/arch/um/include/sigio.h index 1432fcfb3212..fe99ea163c2e 100644 --- a/arch/um/include/sigio.h +++ b/arch/um/include/sigio.h @@ -8,8 +8,6 @@ extern int write_sigio_irq(int fd); extern int register_sigio_fd(int fd); -extern int add_sigio_fd(int fd, int read); -extern int ignore_sigio_fd(int fd); extern void sigio_lock(void); extern void sigio_unlock(void); diff --git a/arch/um/include/user_util.h b/arch/um/include/user_util.h index a6f1f176cf84..992a7e1e0fca 100644 --- a/arch/um/include/user_util.h +++ b/arch/um/include/user_util.h @@ -58,7 +58,6 @@ extern int attach(int pid); extern void kill_child_dead(int pid); extern int cont(int pid); extern void check_sigio(void); -extern void write_sigio_workaround(void); extern void arch_check_bugs(void); extern int cpu_feature(char *what, char *buf, int len); extern int arch_handle_signal(int sig, union uml_pt_regs *regs); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 8458c30a6265..ac5afaa0306f 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -8,7 +8,7 @@ clean-files := obj-y = config.o exec_kern.o exitcode.o \ init_task.o irq.o ksyms.o mem.o physmem.o \ - process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \ + process_kern.o ptrace.o reboot.o resource.o sigio_kern.o \ signal_kern.o smp.o syscall_kern.o sysrq.o \ time_kern.o tlb.o trap_kern.o uaccess.o um_arch.o umid.o diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 73d3d837ed6b..c3d56c2935c2 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -3,14 +3,15 @@ # Licensed under the GPL # -obj-y = aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o signal.o \ - start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o user_syms.o \ - util.o drivers/ sys-$(SUBARCH)/ +obj-y = aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o sigio.o \ + signal.o start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o \ + user_syms.o util.o drivers/ sys-$(SUBARCH)/ obj-$(CONFIG_MODE_SKAS) += skas/ USER_OBJS := aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o \ - signal.o start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o util.o + sigio.o signal.o start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o \ + util.o elf_aux.o: $(ARCH_DIR)/kernel-offsets.h CFLAGS_elf_aux.o += -I$(objtree)/arch/um diff --git a/arch/um/kernel/sigio_user.c b/arch/um/os-Linux/sigio.c similarity index 96% rename from arch/um/kernel/sigio_user.c rename to arch/um/os-Linux/sigio.c index 9bef9413cb5d..7604c404c4c2 100644 --- a/arch/um/kernel/sigio_user.c +++ b/arch/um/os-Linux/sigio.c @@ -20,7 +20,7 @@ #include "sigio.h" #include "os.h" -/* Protected by sigio_lock(), also used by sigio_cleanup, which is an +/* Protected by sigio_lock(), also used by sigio_cleanup, which is an * exitcall. */ static int write_sigio_pid = -1; @@ -143,7 +143,7 @@ static void update_thread(void) return; fail: /* Critical section start */ - if(write_sigio_pid != -1) + if(write_sigio_pid != -1) os_kill_process(write_sigio_pid, 1); write_sigio_pid = -1; close(sigio_private[0]); @@ -160,13 +160,13 @@ int add_sigio_fd(int fd, int read) sigio_lock(); for(i = 0; i < current_poll.used; i++){ - if(current_poll.poll[i].fd == fd) + if(current_poll.poll[i].fd == fd) goto out; } n = current_poll.used + 1; err = need_poll(n); - if(err) + if(err) goto out; for(i = 0; i < current_poll.used; i++) @@ -195,7 +195,7 @@ int ignore_sigio_fd(int fd) } if(i == current_poll.used) goto out; - + err = need_poll(current_poll.used - 1); if(err) goto out; @@ -216,7 +216,7 @@ int ignore_sigio_fd(int fd) return(err); } -static struct pollfd* setup_initial_poll(int fd) +static struct pollfd *setup_initial_poll(int fd) { struct pollfd *p; @@ -256,7 +256,7 @@ void write_sigio_workaround(void) } err = os_pipe(l_sigio_private, 1, 1); if(err < 0){ - printk("write_sigio_workaround - os_pipe 1 failed, " + printk("write_sigio_workaround - os_pipe 2 failed, " "err = %d\n", -err); goto out_close1; } @@ -317,7 +317,7 @@ void write_sigio_workaround(void) void sigio_cleanup(void) { - if (write_sigio_pid != -1) { + if(write_sigio_pid != -1){ os_kill_process(write_sigio_pid, 1); write_sigio_pid = -1; } From 81efcd3300754462537ffac60336158b2f773b4e Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Mon, 27 Mar 2006 01:14:34 -0800 Subject: [PATCH 037/306] [PATCH] uml: more carefully test whether we are in a system call For security reasons, UML in is_syscall() needs to have access to code in vsyscall-page. The current implementation grants this access by explicitly allowing access to vsyscall in access_ok_skas(). With this change, copy_from_user() may be used to read the code. Ptrace access to vsyscall-page for debugging already was implemented in get_user_pages() by mainline. In i386, copy_from_user can't access vsyscall-page, but returns EFAULT. To make UML behave as i386 does, I changed is_syscall to use access_process_vm(current) to read the code from vsyscall-page. This doesn't hurt security, but simplifies the code and prepares implementation of stub-vmas. Signed-off-by: Bodo Stroesser Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/sys-i386/ptrace.c | 15 ++++++++++++--- arch/um/sys-x86_64/ptrace.c | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/arch/um/sys-i386/ptrace.c b/arch/um/sys-i386/ptrace.c index e839ce65ad28..8032a105949a 100644 --- a/arch/um/sys-i386/ptrace.c +++ b/arch/um/sys-i386/ptrace.c @@ -6,6 +6,7 @@ #include #include #include "linux/sched.h" +#include "linux/mm.h" #include "asm/elf.h" #include "asm/ptrace.h" #include "asm/uaccess.h" @@ -26,9 +27,17 @@ int is_syscall(unsigned long addr) n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); if(n){ - printk("is_syscall : failed to read instruction from 0x%lx\n", - addr); - return(0); + /* access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if(n != sizeof(instr)) { + printk("is_syscall : failed to read instruction from " + "0x%lx\n", addr); + return(1); + } } /* int 0x80 or sysenter */ return((instr == 0x80cd) || (instr == 0x340f)); diff --git a/arch/um/sys-x86_64/ptrace.c b/arch/um/sys-x86_64/ptrace.c index 74eee5c7c6dd..147bbf05cbc2 100644 --- a/arch/um/sys-x86_64/ptrace.c +++ b/arch/um/sys-x86_64/ptrace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -136,9 +137,28 @@ void arch_switch(void) */ } +/* XXX Mostly copied from sys-i386 */ int is_syscall(unsigned long addr) { - panic("is_syscall"); + unsigned short instr; + int n; + + n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); + if(n){ + /* access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if(n != sizeof(instr)) { + printk("is_syscall : failed to read instruction from " + "0x%lx\n", addr); + return(1); + } + } + /* sysenter */ + return(instr == 0x050f); } int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu ) From c554f899b6747c9d8035bf91864d89b337ceb411 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:36 -0800 Subject: [PATCH 038/306] [PATCH] uml: move tty logging to os-Linux The serial UML OS-abstraction layer patch (um/kernel dir). This moves all systemcalls from tty_log.c file under os-Linux dir Signed-off-by: Gennady Sharapov Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/Makefile | 5 +---- arch/um/kernel/exec_kern.c | 2 -- arch/um/os-Linux/Makefile | 8 +++++--- arch/um/{kernel => os-Linux}/tty_log.c | 18 +++--------------- 4 files changed, 9 insertions(+), 24 deletions(-) rename arch/um/{kernel => os-Linux}/tty_log.c (91%) diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index ac5afaa0306f..fe08971b64cf 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -15,15 +15,12 @@ obj-y = config.o exec_kern.o exitcode.o \ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_GCOV) += gmon_syms.o -obj-$(CONFIG_TTY_LOG) += tty_log.o obj-$(CONFIG_SYSCALL_DEBUG) += syscall.o obj-$(CONFIG_MODE_TT) += tt/ obj-$(CONFIG_MODE_SKAS) += skas/ -user-objs-$(CONFIG_TTY_LOG) += tty_log.o - -USER_OBJS := $(user-objs-y) config.o tty_log.o +USER_OBJS := config.o include arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/exec_kern.c b/arch/um/kernel/exec_kern.c index c264e1c05ab3..1ca84319317d 100644 --- a/arch/um/kernel/exec_kern.c +++ b/arch/um/kernel/exec_kern.c @@ -30,8 +30,6 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp); } -extern void log_exec(char **argv, void *tty); - static long execve1(char *file, char __user * __user *argv, char __user *__user *env) { diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index c3d56c2935c2..1659386b42bb 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -8,10 +8,12 @@ obj-y = aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o sigio.o \ user_syms.o util.o drivers/ sys-$(SUBARCH)/ obj-$(CONFIG_MODE_SKAS) += skas/ +obj-$(CONFIG_TTY_LOG) += tty_log.o +user-objs-$(CONFIG_TTY_LOG) += tty_log.o -USER_OBJS := aio.o elf_aux.o file.o helper.o irq.o main.o mem.o process.o \ - sigio.o signal.o start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o \ - util.o +USER_OBJS := $(user-objs-y) aio.o elf_aux.o file.o helper.o irq.o main.o mem.o \ + process.o sigio.o signal.o start_up.o time.o trap.o tt.o tty.o \ + uaccess.o umid.o util.o elf_aux.o: $(ARCH_DIR)/kernel-offsets.h CFLAGS_elf_aux.o += -I$(objtree)/arch/um diff --git a/arch/um/kernel/tty_log.c b/arch/um/os-Linux/tty_log.c similarity index 91% rename from arch/um/kernel/tty_log.c rename to arch/um/os-Linux/tty_log.c index 9ada656f68ce..c6ba56c1560f 100644 --- a/arch/um/kernel/tty_log.c +++ b/arch/um/os-Linux/tty_log.c @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) and +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) and * geoffrey hing * Licensed under the GPL */ @@ -58,7 +58,7 @@ int open_tty_log(void *tty, void *current_tty) return(tty_log_fd); } - sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, + sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, (unsigned int) tv.tv_usec); fd = os_open_file(buf, of_append(of_create(of_rdwr(OPENFLAGS()))), @@ -216,15 +216,3 @@ __uml_setup("tty_log_fd=", set_tty_log_fd, " tty data will be written. Preconfigure the descriptor with something\n" " like '10>tty_log tty_log_fd=10'.\n\n" ); - - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ From cf9165a50aa21027d2f0eefc90476ec59ae6d2b8 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:36 -0800 Subject: [PATCH 039/306] [PATCH] uml: oS header cleanups This rearranges the OS declarations by moving some declarations into os.h. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/os.h | 14 ++++++++++++++ arch/um/include/skas/mode-skas.h | 1 - arch/um/include/skas/skas.h | 1 - 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/um/include/os.h b/arch/um/include/os.h index 53dee8d3cdcc..d3d1bc6074ef 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -122,6 +122,7 @@ static inline struct openflags of_cloexec(struct openflags flags) return(flags); } +/* file.c */ extern int os_stat_file(const char *file_name, struct uml_stat *buf); extern int os_stat_fd(const int fd, struct uml_stat *buf); extern int os_access(const char *file, int mode); @@ -157,6 +158,15 @@ extern int os_connect_socket(char *name); extern int os_file_type(char *file); extern int os_file_mode(char *file, struct openflags *mode_out); extern int os_lock_file(int fd, int excl); +extern void os_flush_stdout(void); +extern int os_stat_filesystem(char *path, long *bsize_out, + long long *blocks_out, long long *bfree_out, + long long *bavail_out, long long *files_out, + long long *ffree_out, void *fsid_out, + int fsid_size, long *namelen_out, + long *spare_out); +extern int os_change_dir(char *dir); +extern int os_fchange_dir(int fd); /* start_up.c */ extern void os_early_checks(void); @@ -316,4 +326,8 @@ extern void write_sigio_workaround(void); extern int add_sigio_fd(int fd, int read); extern int ignore_sigio_fd(int fd); +/* skas/trap */ +extern void sig_handler_common_skas(int sig, void *sc_ptr); +extern void user_signal(int sig, union uml_pt_regs *regs, int pid); + #endif diff --git a/arch/um/include/skas/mode-skas.h b/arch/um/include/skas/mode-skas.h index 260065cfeef1..8bc6916bbbb1 100644 --- a/arch/um/include/skas/mode-skas.h +++ b/arch/um/include/skas/mode-skas.h @@ -13,7 +13,6 @@ extern unsigned long exec_fp_regs[]; extern unsigned long exec_fpx_regs[]; extern int have_fpx_regs; -extern void sig_handler_common_skas(int sig, void *sc_ptr); extern void kill_off_processes_skas(void); #endif diff --git a/arch/um/include/skas/skas.h b/arch/um/include/skas/skas.h index 86357282d681..853b26f148c5 100644 --- a/arch/um/include/skas/skas.h +++ b/arch/um/include/skas/skas.h @@ -17,7 +17,6 @@ extern int user_thread(unsigned long stack, int flags); extern void new_thread_proc(void *stack, void (*handler)(int sig)); extern void new_thread_handler(int sig); extern void handle_syscall(union uml_pt_regs *regs); -extern void user_signal(int sig, union uml_pt_regs *regs, int pid); extern int new_mm(unsigned long stack); extern void get_skas_faultinfo(int pid, struct faultinfo * fi); extern long execute_syscall_skas(void *r); From 6c29256c5703ad7c3cf62276dbc38d802a05669e Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:37 -0800 Subject: [PATCH 040/306] [PATCH] uml: allow ubd devices to be shared in a cluster This adds a 'c' option to the ubd switch which turns off host file locking so that the device can be shared, as with a cluster. There's also some whitespace cleanup while I was in this file. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/ubd_kern.c | 76 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index fa617e0719ab..0336575d2448 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -71,7 +71,7 @@ struct io_thread_req { int error; }; -extern int open_ubd_file(char *file, struct openflags *openflags, +extern int open_ubd_file(char *file, struct openflags *openflags, int shared, char **backing_file_out, int *bitmap_offset_out, unsigned long *bitmap_len_out, int *data_offset_out, int *create_cow_out); @@ -137,7 +137,7 @@ static int fake_major = MAJOR_NR; static struct gendisk *ubd_gendisk[MAX_DEV]; static struct gendisk *fake_gendisk[MAX_DEV]; - + #ifdef CONFIG_BLK_DEV_UBD_SYNC #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ .cl = 1 }) @@ -168,6 +168,7 @@ struct ubd { __u64 size; struct openflags boot_openflags; struct openflags openflags; + int shared; int no_cow; struct cow cow; struct platform_device pdev; @@ -189,6 +190,7 @@ struct ubd { .boot_openflags = OPEN_FLAGS, \ .openflags = OPEN_FLAGS, \ .no_cow = 0, \ + .shared = 0, \ .cow = DEFAULT_COW, \ } @@ -305,7 +307,7 @@ static int ubd_setup_common(char *str, int *index_out) } major = simple_strtoul(str, &end, 0); if((*end != '\0') || (end == str)){ - printk(KERN_ERR + printk(KERN_ERR "ubd_setup : didn't parse major number\n"); return(1); } @@ -316,7 +318,7 @@ static int ubd_setup_common(char *str, int *index_out) printk(KERN_ERR "Can't assign a fake major twice\n"); goto out1; } - + fake_major = major; printk(KERN_INFO "Setting extra ubd major number to %d\n", @@ -351,7 +353,7 @@ static int ubd_setup_common(char *str, int *index_out) if (index_out) *index_out = n; - for (i = 0; i < 4; i++) { + for (i = 0; i < sizeof("rscd="); i++) { switch (*str) { case 'r': flags.w = 0; @@ -362,11 +364,14 @@ static int ubd_setup_common(char *str, int *index_out) case 'd': dev->no_cow = 1; break; + case 'c': + dev->shared = 1; + break; case '=': str++; goto break_loop; default: - printk(KERN_ERR "ubd_setup : Expected '=' or flag letter (r,s or d)\n"); + printk(KERN_ERR "ubd_setup : Expected '=' or flag letter (r, s, c, or d)\n"); goto out; } str++; @@ -515,7 +520,7 @@ static void ubd_handler(void) spin_unlock(&ubd_io_lock); return; } - + ubd_finish(rq, req.error); reactivate_fd(thread_fd, UBD_IRQ); do_ubd_request(ubd_queue); @@ -532,7 +537,7 @@ static int io_pid = -1; void kill_io_thread(void) { - if(io_pid != -1) + if(io_pid != -1) os_kill_process(io_pid, 1); } @@ -567,14 +572,15 @@ static int ubd_open_dev(struct ubd *dev) create_cow = 0; create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL; back_ptr = dev->no_cow ? NULL : &dev->cow.file; - dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr, - &dev->cow.bitmap_offset, &dev->cow.bitmap_len, - &dev->cow.data_offset, create_ptr); + dev->fd = open_ubd_file(dev->file, &dev->openflags, dev->shared, + back_ptr, &dev->cow.bitmap_offset, + &dev->cow.bitmap_len, &dev->cow.data_offset, + create_ptr); if((dev->fd == -ENOENT) && create_cow){ - dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->fd = create_cow_file(dev->file, dev->cow.file, dev->openflags, 1 << 9, PAGE_SIZE, - &dev->cow.bitmap_offset, + &dev->cow.bitmap_offset, &dev->cow.bitmap_len, &dev->cow.data_offset); if(dev->fd >= 0){ @@ -598,16 +604,16 @@ static int ubd_open_dev(struct ubd *dev) } flush_tlb_kernel_vm(); - err = read_cow_bitmap(dev->fd, dev->cow.bitmap, - dev->cow.bitmap_offset, + err = read_cow_bitmap(dev->fd, dev->cow.bitmap, + dev->cow.bitmap_offset, dev->cow.bitmap_len); if(err < 0) goto error; flags = dev->openflags; flags.w = 0; - err = open_ubd_file(dev->cow.file, &flags, NULL, NULL, NULL, - NULL, NULL); + err = open_ubd_file(dev->cow.file, &flags, dev->shared, NULL, + NULL, NULL, NULL, NULL); if(err < 0) goto error; dev->cow.fd = err; } @@ -685,11 +691,11 @@ static int ubd_add(int n) dev->size = ROUND_BLOCK(dev->size); err = ubd_new_disk(MAJOR_NR, dev->size, n, &ubd_gendisk[n]); - if(err) + if(err) goto out_close; - + if(fake_major != MAJOR_NR) - ubd_new_disk(fake_major, dev->size, n, + ubd_new_disk(fake_major, dev->size, n, &fake_gendisk[n]); /* perhaps this should also be under the "if (fake_major)" above */ @@ -854,7 +860,7 @@ int ubd_init(void) return -1; } platform_driver_register(&ubd_driver); - for (i = 0; i < MAX_DEV; i++) + for (i = 0; i < MAX_DEV; i++) ubd_add(i); return 0; } @@ -872,16 +878,16 @@ int ubd_driver_init(void){ * enough. So use anyway the io thread. */ } stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), + io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), &thread_fd); if(io_pid < 0){ - printk(KERN_ERR + printk(KERN_ERR "ubd : Failed to start I/O thread (errno = %d) - " "falling back to synchronous I/O\n", -io_pid); io_pid = -1; return(0); } - err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, + err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, SA_INTERRUPT, "ubd", ubd_dev); if(err != 0) printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err); @@ -978,7 +984,7 @@ static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, if(req->op == UBD_READ) { for(i = 0; i < req->length >> 9; i++){ if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) - ubd_set_bit(i, (unsigned char *) + ubd_set_bit(i, (unsigned char *) &req->sector_mask); } } @@ -999,7 +1005,7 @@ static int prepare_request(struct request *req, struct io_thread_req *io_req) /* This should be impossible now */ if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ - printk("Write attempted on readonly ubd device %s\n", + printk("Write attempted on readonly ubd device %s\n", disk->disk_name); end_request(req, 0); return(1); @@ -1182,7 +1188,7 @@ int read_cow_bitmap(int fd, void *buf, int offset, int len) return(0); } -int open_ubd_file(char *file, struct openflags *openflags, +int open_ubd_file(char *file, struct openflags *openflags, int shared, char **backing_file_out, int *bitmap_offset_out, unsigned long *bitmap_len_out, int *data_offset_out, int *create_cow_out) @@ -1206,10 +1212,14 @@ int open_ubd_file(char *file, struct openflags *openflags, return fd; } - err = os_lock_file(fd, openflags->w); - if(err < 0){ - printk("Failed to lock '%s', err = %d\n", file, -err); - goto out_close; + if(shared) + printk("Not locking \"%s\" on the host\n", file); + else { + err = os_lock_file(fd, openflags->w); + if(err < 0){ + printk("Failed to lock '%s', err = %d\n", file, -err); + goto out_close; + } } /* Succesful return case! */ @@ -1260,7 +1270,7 @@ int create_cow_file(char *cow_file, char *backing_file, struct openflags flags, int err, fd; flags.c = 1; - fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); + fd = open_ubd_file(cow_file, &flags, 0, NULL, NULL, NULL, NULL, NULL); if(fd < 0){ err = fd; printk("Open of COW file '%s' failed, errno = %d\n", cow_file, From 98c18238f146a9038098244d60a7d2b1f67ee790 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:38 -0800 Subject: [PATCH 041/306] [PATCH] uml: fix segfault on signal delivery This fixes a process segfault where a signal was being delivered such that a new stack page needed to be allocated to hold the signal frame. This was tripping some logic in the page fault handler which wouldn't allocate the page if the faulting address was more that 32 bytes lower than the current stack pointer. Since a signal frame is greater than 32 bytes, this exercised that case. It's fixed by updating the SP in the pt_regs before starting to copy the signal frame. Since those are the registers that will be copied on to the stack, we have to be careful to put the original SP, not the new one which points to the signal frame, on the stack. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/sys-i386/signal.c | 59 +++++++++++++++++++++++++++---------- arch/um/sys-x86_64/signal.c | 56 +++++++++++++++++++++++++---------- 2 files changed, 85 insertions(+), 30 deletions(-) diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 7cd1a82dc8c2..33a40f5ef0d2 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -58,7 +58,7 @@ static int copy_sc_from_user_skas(struct pt_regs *regs, } int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, - struct pt_regs *regs) + struct pt_regs *regs, unsigned long sp) { struct sigcontext sc; unsigned long fpregs[HOST_FP_SIZE]; @@ -72,7 +72,7 @@ int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, sc.edi = REGS_EDI(regs->regs.skas.regs); sc.esi = REGS_ESI(regs->regs.skas.regs); sc.ebp = REGS_EBP(regs->regs.skas.regs); - sc.esp = REGS_SP(regs->regs.skas.regs); + sc.esp = sp; sc.ebx = REGS_EBX(regs->regs.skas.regs); sc.edx = REGS_EDX(regs->regs.skas.regs); sc.ecx = REGS_ECX(regs->regs.skas.regs); @@ -132,7 +132,7 @@ int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, } int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, - struct sigcontext *from, int fpsize) + struct sigcontext *from, int fpsize, unsigned long sp) { struct _fpstate *to_fp, *from_fp; int err; @@ -140,11 +140,18 @@ int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); from_fp = from->fpstate; err = copy_to_user(to, from, sizeof(*to)); + + /* The SP in the sigcontext is the updated one for the signal + * delivery. The sp passed in is the original, and this needs + * to be restored, so we stick it in separately. + */ + err |= copy_to_user(&SC_SP(to), sp, sizeof(sp)); + if(from_fp != NULL){ err |= copy_to_user(&to->fpstate, &to_fp, sizeof(to->fpstate)); err |= copy_to_user(to_fp, from_fp, fpsize); } - return(err); + return err; } #endif @@ -159,11 +166,11 @@ static int copy_sc_from_user(struct pt_regs *to, void __user *from) } static int copy_sc_to_user(struct sigcontext *to, struct _fpstate *fp, - struct pt_regs *from) + struct pt_regs *from, unsigned long sp) { return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), - sizeof(*fp)), - copy_sc_to_user_skas(to, fp, from))); + sizeof(*fp), sp), + copy_sc_to_user_skas(to, fp, from, sp))); } static int copy_ucontext_to_user(struct ucontext *uc, struct _fpstate *fp, @@ -174,7 +181,7 @@ static int copy_ucontext_to_user(struct ucontext *uc, struct _fpstate *fp, err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); - err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs); + err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, sp); err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); return(err); } @@ -207,6 +214,7 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, { struct sigframe __user *frame; void *restorer; + unsigned long save_sp = PT_REGS_SP(regs); int err = 0; stack_top &= -8UL; @@ -218,9 +226,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, if(ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; + /* Update SP now because the page fault handler refuses to extend + * the stack if the faulting address is too far below the current + * SP, which frame now certainly is. If there's an error, the original + * value is restored on the way out. + * When writing the sigcontext to the stack, we have to write the + * original value, so that's passed to copy_sc_to_user, which does + * the right thing with it. + */ + PT_REGS_SP(regs) = (unsigned long) frame; + err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); - err |= copy_sc_to_user(&frame->sc, NULL, regs); + err |= copy_sc_to_user(&frame->sc, NULL, regs, save_sp); err |= __put_user(mask->sig[0], &frame->sc.oldmask); if (_NSIG_WORDS > 1) err |= __copy_to_user(&frame->extramask, &mask->sig[1], @@ -238,7 +256,7 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if(err) - return(err); + goto err; PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; @@ -248,7 +266,11 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); - return(0); + return 0; + +err: + PT_REGS_SP(regs) = save_sp; + return err; } int setup_signal_stack_si(unsigned long stack_top, int sig, @@ -257,6 +279,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, { struct rt_sigframe __user *frame; void *restorer; + unsigned long save_sp = PT_REGS_SP(regs); int err = 0; stack_top &= -8UL; @@ -268,13 +291,16 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if(ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; + /* See comment above about why this is here */ + PT_REGS_SP(regs) = (unsigned long) frame; + err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, - PT_REGS_SP(regs)); + save_sp); /* * This is movl $,%eax ; int $0x80 @@ -288,9 +314,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if(err) - return(err); + goto err; - PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; PT_REGS_EAX(regs) = (unsigned long) sig; PT_REGS_EDX(regs) = (unsigned long) &frame->info; @@ -298,7 +323,11 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); - return(0); + return 0; + +err: + PT_REGS_SP(regs) = save_sp; + return err; } long sys_sigreturn(struct pt_regs regs) diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c index fe1d065332b1..e75c4e1838b0 100644 --- a/arch/um/sys-x86_64/signal.c +++ b/arch/um/sys-x86_64/signal.c @@ -55,7 +55,8 @@ static int copy_sc_from_user_skas(struct pt_regs *regs, } int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, - struct pt_regs *regs, unsigned long mask) + struct pt_regs *regs, unsigned long mask, + unsigned long sp) { struct faultinfo * fi = ¤t->thread.arch.faultinfo; int err = 0; @@ -70,7 +71,11 @@ int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, err |= PUTREG(regs, RDI, to, rdi); err |= PUTREG(regs, RSI, to, rsi); err |= PUTREG(regs, RBP, to, rbp); - err |= PUTREG(regs, RSP, to, rsp); + /* Must use orignal RSP, which is passed in, rather than what's in + * the pt_regs, because that's already been updated to point at the + * signal frame. + */ + err |= __put_user(sp, &to->rsp); err |= PUTREG(regs, RBX, to, rbx); err |= PUTREG(regs, RDX, to, rdx); err |= PUTREG(regs, RCX, to, rcx); @@ -102,7 +107,7 @@ int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, #ifdef CONFIG_MODE_TT int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, - int fpsize) + int fpsize) { struct _fpstate *to_fp, *from_fp; unsigned long sigs; @@ -120,7 +125,7 @@ int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, } int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, - struct sigcontext *from, int fpsize) + struct sigcontext *from, int fpsize, unsigned long sp) { struct _fpstate *to_fp, *from_fp; int err; @@ -128,11 +133,17 @@ int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); from_fp = from->fpstate; err = copy_to_user(to, from, sizeof(*to)); + /* The SP in the sigcontext is the updated one for the signal + * delivery. The sp passed in is the original, and this needs + * to be restored, so we stick it in separately. + */ + err |= copy_to_user(&SC_SP(to), sp, sizeof(sp)); + if(from_fp != NULL){ err |= copy_to_user(&to->fpstate, &to_fp, sizeof(to->fpstate)); err |= copy_to_user(to_fp, from_fp, fpsize); } - return(err); + return err; } #endif @@ -148,11 +159,12 @@ static int copy_sc_from_user(struct pt_regs *to, void __user *from) } static int copy_sc_to_user(struct sigcontext *to, struct _fpstate *fp, - struct pt_regs *from, unsigned long mask) + struct pt_regs *from, unsigned long mask, + unsigned long sp) { return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), - sizeof(*fp)), - copy_sc_to_user_skas(to, fp, from, mask))); + sizeof(*fp), sp), + copy_sc_to_user_skas(to, fp, from, mask, sp))); } struct rt_sigframe @@ -170,6 +182,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, { struct rt_sigframe __user *frame; struct _fpstate __user *fp = NULL; + unsigned long save_sp = PT_REGS_RSP(regs); int err = 0; struct task_struct *me = current; @@ -193,14 +206,25 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, goto out; } + /* Update SP now because the page fault handler refuses to extend + * the stack if the faulting address is too far below the current + * SP, which frame now certainly is. If there's an error, the original + * value is restored on the way out. + * When writing the sigcontext to the stack, we have to write the + * original value, so that's passed to copy_sc_to_user, which does + * the right thing with it. + */ + PT_REGS_RSP(regs) = (unsigned long) frame; + /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(PT_REGS_SP(regs)), + err |= __put_user(sas_ss_flags(save_sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= copy_sc_to_user(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= copy_sc_to_user(&frame->uc.uc_mcontext, fp, regs, set->sig[0], + save_sp); err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); @@ -217,10 +241,10 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); else /* could use a vstub here */ - goto out; + goto restore_sp; if (err) - goto out; + goto restore_sp; /* Set up registers for signal handler */ { @@ -238,10 +262,12 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_RSI(regs) = (unsigned long) &frame->info; PT_REGS_RDX(regs) = (unsigned long) &frame->uc; PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; - - PT_REGS_RSP(regs) = (unsigned long) frame; out: - return(err); + return err; + +restore_sp: + PT_REGS_RSP(regs) = save_sp; + return err; } long sys_rt_sigreturn(struct pt_regs *regs) From 1fbbd6844e6a84e5d166ab475dc298d3b89fa4bf Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:39 -0800 Subject: [PATCH 042/306] [PATCH] uml: prevent umid theft Behavior when booting two UMLs with the same umid was broken. The second one would steal the umid. This fixes that, making the second UML take a random umid instead. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/umid.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index ecf107ae5ac8..198e59163288 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -143,8 +143,10 @@ static int not_dead_yet(char *dir) goto out_close; } - if((kill(p, 0) == 0) || (errno != ESRCH)) + if((kill(p, 0) == 0) || (errno != ESRCH)){ + printk("umid \"%s\" is already in use by pid %d\n", umid, p); return 1; + } err = actually_do_remove(dir); if(err) @@ -234,33 +236,44 @@ int __init make_umid(void) err = mkdir(tmp, 0777); if(err < 0){ err = -errno; - if(errno != EEXIST) + if(err != -EEXIST) goto err; - if(not_dead_yet(tmp) < 0) + /* 1 -> this umid is already in use + * < 0 -> we couldn't remove the umid directory + * In either case, we can't use this umid, so return -EEXIST. + */ + if(not_dead_yet(tmp) != 0) goto err; err = mkdir(tmp, 0777); } - if(err < 0){ - printk("Failed to create '%s' - err = %d\n", umid, err); - goto err_rmdir; + if(err){ + err = -errno; + printk("Failed to create '%s' - err = %d\n", umid, -errno); + goto err; } umid_setup = 1; create_pid_file(); - return 0; - - err_rmdir: - rmdir(tmp); + err = 0; err: return err; } static int __init make_umid_init(void) { + if(!make_umid()) + return 0; + + /* If initializing with the given umid failed, then try again with + * a random one. + */ + printk("Failed to initialize umid \"%s\", trying with a random umid\n", + umid); + *umid = '\0'; make_umid(); return 0; From 5f4e8fd08f3993bc31a7dd91767fdb4da4fe6278 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:40 -0800 Subject: [PATCH 043/306] [PATCH] uml: fix thread startup race This fixes a race in the starting of write_sigio_thread. Previously, some of the data needed by the thread was initialized after the clone. If the thread ran immediately, it would see the uninitialized data, including an empty pollfds, which would cause it to hang. We move the data initialization to before the clone, and adjust the error paths and cleanup accordingly. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/sigio.c | 57 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 7604c404c4c2..9ba942947146 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -29,8 +29,10 @@ static int write_sigio_pid = -1; * the descriptors closed after it is killed. So, it can't see them change. * On the UML side, they are changed under the sigio_lock. */ -static int write_sigio_fds[2] = { -1, -1 }; -static int sigio_private[2] = { -1, -1 }; +#define SIGIO_FDS_INIT {-1, -1} + +static int write_sigio_fds[2] = SIGIO_FDS_INIT; +static int sigio_private[2] = SIGIO_FDS_INIT; struct pollfds { struct pollfd *poll; @@ -270,7 +272,17 @@ void write_sigio_workaround(void) /* Did we race? Don't try to optimize this, please, it's not so likely * to happen, and no more than once at the boot. */ if(write_sigio_pid != -1) - goto out_unlock; + goto out_free; + + current_poll = ((struct pollfds) { .poll = p, + .used = 1, + .size = 1 }); + + if (write_sigio_irq(l_write_sigio_fds[0])) + goto out_clear_poll; + + memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds)); + memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private)); write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, CLONE_FILES | CLONE_VM, &stack, 0); @@ -278,41 +290,28 @@ void write_sigio_workaround(void) if (write_sigio_pid < 0) goto out_clear; - if (write_sigio_irq(l_write_sigio_fds[0])) - goto out_kill; - - /* Success, finally. */ - memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds)); - memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private)); - - current_poll = ((struct pollfds) { .poll = p, - .used = 1, - .size = 1 }); - sigio_unlock(); return; - out_kill: - l_write_sigio_pid = write_sigio_pid; +out_clear: write_sigio_pid = -1; - sigio_unlock(); - /* Going to call waitpid, avoid holding the lock. */ - os_kill_process(l_write_sigio_pid, 1); - goto out_free; - - out_clear: - write_sigio_pid = -1; - out_unlock: - sigio_unlock(); - out_free: + write_sigio_fds[0] = -1; + write_sigio_fds[1] = -1; + sigio_private[0] = -1; + sigio_private[1] = -1; +out_clear_poll: + current_poll = ((struct pollfds) { .poll = NULL, + .size = 0, + .used = 0 }); +out_free: kfree(p); - out_close2: + sigio_unlock(); +out_close2: close(l_sigio_private[0]); close(l_sigio_private[1]); - out_close1: +out_close1: close(l_write_sigio_fds[0]); close(l_write_sigio_fds[1]); - return; } void sigio_cleanup(void) From 86c79cbcee7d617c5aaf9b4f7e6cc093397d3451 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 27 Mar 2006 01:14:41 -0800 Subject: [PATCH 044/306] [PATCH] uml: fix hostfs stack corruption Noted by Oleg Drokin: We initialized an extra slot of struct kstatfs.spare, sometimes causing stack corruption. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Oleg Drokin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs_user.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index b97809deba66..23b7cee72123 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -360,7 +360,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, spare_out[2] = buf.f_spare[2]; spare_out[3] = buf.f_spare[3]; spare_out[4] = buf.f_spare[4]; - spare_out[5] = buf.f_spare[5]; return(0); } From 718c604a28e35848754a65b839e4877ec34b2fca Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:42 -0800 Subject: [PATCH 045/306] [PATCH] autofs4: lookup white space cleanup Whitespace and formating changes to lookup code. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 62d8d4acb8bb..2c676bd44acd 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -296,20 +296,20 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f { struct super_block *sb = mnt->mnt_sb; struct autofs_sb_info *sbi = autofs4_sbi(sb); - struct autofs_info *de_info = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs4_dentry_ino(dentry); int status = 0; /* Block on any pending expiry here; invalidate the dentry when expiration is done to trigger mount request with a new dentry */ - if (de_info && (de_info->flags & AUTOFS_INF_EXPIRING)) { + if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { DPRINTK("waiting for expire %p name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_NONE); - + DPRINTK("expire done status=%d", status); - + /* * If the directory still exists the mount request must * continue otherwise it can't be followed at the right @@ -323,18 +323,21 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f DPRINTK("dentry=%p %.*s ino=%p", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - /* Wait for a pending mount, triggering one if there isn't one already */ + /* + * Wait for a pending mount, triggering one if there + * isn't one already + */ if (dentry->d_inode == NULL) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_MOUNT); - + DPRINTK("mount done status=%d", status); if (status && dentry->d_inode) return 0; /* Try to get the kernel to invalidate this dentry */ - + /* Turn this into a real negative dentry? */ if (status == -ENOENT) { dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT; @@ -367,8 +370,10 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f } } - /* We don't update the usages for the autofs daemon itself, this - is necessary for recursive autofs mounts */ + /* + * We don't update the usages for the autofs daemon itself, this + * is necessary for recursive autofs mounts + */ if (!autofs4_oz_mode(sbi)) autofs4_update_usage(mnt, dentry); @@ -384,9 +389,9 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f * yet completely filled in, and revalidate has to delay such * lookups.. */ -static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) +static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode * dir = dentry->d_parent->d_inode; + struct inode *dir = dentry->d_parent->d_inode; struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); int oz_mode = autofs4_oz_mode(sbi); int flags = nd ? nd->flags : 0; @@ -462,12 +467,13 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); + /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */ + return ERR_PTR(-ENAMETOOLONG); sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); + DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, process_group(current), sbi->catatonic, oz_mode); @@ -519,7 +525,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s * doesn't do the right thing for all system calls, but it should * be OK for the operations we permit from an autofs. */ - if ( dentry->d_inode && d_unhashed(dentry) ) + if (dentry->d_inode && d_unhashed(dentry)) return ERR_PTR(-ENOENT); return NULL; From f360ce3be466d50de153b001b24561ca7593042b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:43 -0800 Subject: [PATCH 046/306] [PATCH] autofs4: use libfs routines for readdir Change readdir routines to use the cursor based routines in libfs.c. This removes reliance on old readdir code from 2.4 and should improve efficiency of readdir in autofs4. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 126 +++++++++++++++------------------------------- 1 file changed, 40 insertions(+), 86 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2c676bd44acd..af9a4c6bbadf 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -30,7 +30,6 @@ static int autofs4_dir_close(struct inode *inode, struct file *file); static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); -static int autofs4_dcache_readdir(struct file *, void *, filldir_t); struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -82,7 +81,7 @@ static int autofs4_root_readdir(struct file *file, void *dirent, DPRINTK("needs_reghost = %d", sbi->needs_reghost); - return autofs4_dcache_readdir(file, dirent, filldir); + return dcache_readdir(file, dirent, filldir); } /* Update usage from here to top of tree, so that scan of @@ -103,75 +102,21 @@ static void autofs4_update_usage(struct vfsmount *mnt, struct dentry *dentry) spin_unlock(&dcache_lock); } -/* - * From 2.4 kernel readdir.c - */ -static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) -{ - int i; - struct dentry *dentry = filp->f_dentry; - - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, dentry->d_inode->i_ino, DT_DIR) < 0) - break; - i++; - filp->f_pos++; - /* fallthrough */ - case 1: - if (filldir(dirent, "..", 2, i, dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) - break; - i++; - filp->f_pos++; - /* fallthrough */ - default: { - struct list_head *list; - int j = i-2; - - spin_lock(&dcache_lock); - list = dentry->d_subdirs.next; - - for (;;) { - if (list == &dentry->d_subdirs) { - spin_unlock(&dcache_lock); - return 0; - } - if (!j) - break; - j--; - list = list->next; - } - - while(1) { - struct dentry *de = list_entry(list, - struct dentry, d_u.d_child); - - if (!d_unhashed(de) && de->d_inode) { - spin_unlock(&dcache_lock); - if (filldir(dirent, de->d_name.name, de->d_name.len, filp->f_pos, de->d_inode->i_ino, DT_UNKNOWN) < 0) - break; - spin_lock(&dcache_lock); - } - filp->f_pos++; - list = list->next; - if (list != &dentry->d_subdirs) - continue; - spin_unlock(&dcache_lock); - break; - } - } - } - return 0; -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_dentry; struct vfsmount *mnt = file->f_vfsmnt; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *cursor; int status; + status = dcache_dir_open(inode, file); + if (status) + goto out; + + cursor = file->private_data; + cursor->d_fsdata = NULL; + DPRINTK("file=%p dentry=%p %.*s", file, dentry, dentry->d_name.len, dentry->d_name.name); @@ -180,12 +125,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (autofs4_ispending(dentry)) { DPRINTK("dentry busy"); - return -EBUSY; + dcache_dir_close(inode, file); + status = -EBUSY; + goto out; } + status = -ENOENT; if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { struct nameidata nd; - int empty; + int empty, ret; /* In case there are stale directory dentrys from a failed mount */ spin_lock(&dcache_lock); @@ -195,13 +143,13 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (!empty) d_invalidate(dentry); - nd.dentry = dentry; - nd.mnt = mnt; nd.flags = LOOKUP_DIRECTORY; - status = (dentry->d_op->d_revalidate)(dentry, &nd); + ret = (dentry->d_op->d_revalidate)(dentry, &nd); - if (!status) - return -ENOENT; + if (!ret) { + dcache_dir_close(inode, file); + goto out; + } } if (d_mountpoint(dentry)) { @@ -212,25 +160,29 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (!autofs4_follow_mount(&fp_mnt, &fp_dentry)) { dput(fp_dentry); mntput(fp_mnt); - return -ENOENT; + dcache_dir_close(inode, file); + goto out; } fp = dentry_open(fp_dentry, fp_mnt, file->f_flags); status = PTR_ERR(fp); if (IS_ERR(fp)) { - file->private_data = NULL; - return status; + dcache_dir_close(inode, file); + goto out; } - file->private_data = fp; + cursor->d_fsdata = fp; } -out: return 0; +out: + return status; } static int autofs4_dir_close(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *cursor = file->private_data; + int status = 0; DPRINTK("file=%p dentry=%p %.*s", file, dentry, dentry->d_name.len, dentry->d_name.name); @@ -240,26 +192,28 @@ static int autofs4_dir_close(struct inode *inode, struct file *file) if (autofs4_ispending(dentry)) { DPRINTK("dentry busy"); - return -EBUSY; + status = -EBUSY; + goto out; } if (d_mountpoint(dentry)) { - struct file *fp = file->private_data; - - if (!fp) - return -ENOENT; - + struct file *fp = cursor->d_fsdata; + if (!fp) { + status = -ENOENT; + goto out; + } filp_close(fp, current->files); - file->private_data = NULL; } out: - return 0; + dcache_dir_close(inode, file); + return status; } static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) { struct dentry *dentry = file->f_dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *cursor = file->private_data; int status; DPRINTK("file=%p dentry=%p %.*s", @@ -274,7 +228,7 @@ static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldi } if (d_mountpoint(dentry)) { - struct file *fp = file->private_data; + struct file *fp = cursor->d_fsdata; if (!fp) return -ENOENT; @@ -289,7 +243,7 @@ static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldi return status; } out: - return autofs4_dcache_readdir(file, dirent, filldir); + return dcache_readdir(file, dirent, filldir); } static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int flags) From 2d753e62b87ab2fc72bb4ff5153791d32ff9c08e Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:44 -0800 Subject: [PATCH 047/306] [PATCH] autofs4: can't mount due to mount point dir not empty Addresse a problem where stale dentrys stop mounts from happening. When a mount point directory is pre-created and a non-existent entry within it is requested a dentry ends up being created within the mount point directory which stops future mounts. The problem is solved by ignoring negative, unhashed dentrys in the mount point d_subdirs list. Additionally the apparent cacheing of -ENOENT returns from requests is removed. The test on d_time is a tautology and d_time is not initialised and has an unexpected value. In short it doesn't do what it's meant to. The cacheing of failed requests to the daemon is important and will be followed up later. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 8 -------- fs/autofs4/root.c | 9 ++++----- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index f54c5b21f876..eea25934da62 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -41,14 +41,6 @@ #define AUTOFS_SUPER_MAGIC 0x0187 -/* - * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the - * kernel will keep the negative response cached for up to the time given - * here, although the time can be shorter if the kernel throws the dcache - * entry away. This probably should be settable from user space. - */ -#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */ - /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index af9a4c6bbadf..c7ff35774344 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -294,14 +294,13 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f /* Turn this into a real negative dentry? */ if (status == -ENOENT) { - dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT; spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - return 1; + return 0; } else if (status) { /* Return a negative dentry, but leave it "pending" */ - return 1; + return 0; } /* Trigger mount for path component or follow link */ } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || @@ -360,13 +359,13 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) /* Negative dentry.. invalidate if "old" */ if (dentry->d_inode == NULL) - return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT); + return 0; /* Check for a non-mountpoint directory with no contents */ spin_lock(&dcache_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && - list_empty(&dentry->d_subdirs)) { + simple_empty_nolock(dentry)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); From 1f5f2c3059ae8cc23cb3303daf324b06ba79517a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:44 -0800 Subject: [PATCH 048/306] [PATCH] autofs4: expire code readability cleanup Change the names of the boolean functions autofs4_check_mount and autofs4_check_tree to autofs4_mount_busy and autofs4_tree_busy respectively and alters their return codes to suit in order to aid code readabilty. A couple of white space cleanups are included as well. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 54 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index dc39589df165..bcc17f533699 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -16,7 +16,7 @@ static unsigned long now; -/* Check if a dentry can be expired return 1 if it can else return 0 */ +/* Check if a dentry can be expired */ static inline int autofs4_can_expire(struct dentry *dentry, unsigned long timeout, int do_now) { @@ -41,14 +41,13 @@ static inline int autofs4_can_expire(struct dentry *dentry, attempts if expire fails the first time */ ino->last_used = now; } - return 1; } -/* Check a mount point for busyness return 1 if not busy, otherwise */ -static int autofs4_check_mount(struct vfsmount *mnt, struct dentry *dentry) +/* Check a mount point for busyness */ +static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { - int status = 0; + int status = 1; DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); @@ -65,7 +64,7 @@ static int autofs4_check_mount(struct vfsmount *mnt, struct dentry *dentry) /* The big question */ if (may_umount_tree(mnt) == 0) - status = 1; + status = 0; done: DPRINTK("returning = %d", status); mntput(mnt); @@ -75,30 +74,29 @@ done: /* Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy - * Return 1 if the tree is busy or 0 otherwise */ -static int autofs4_check_tree(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs4_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { struct dentry *this_parent = top; struct list_head *next; - DPRINTK("parent %p %.*s", + DPRINTK("top %p %.*s", top, (int)top->d_name.len, top->d_name.name); /* Negative dentry - give up */ if (!simple_positive(top)) - return 0; + return 1; /* Timeout of a tree mount is determined by its top dentry */ if (!autofs4_can_expire(top, timeout, do_now)) - return 0; + return 1; /* Is someone visiting anywhere in the tree ? */ if (may_umount_tree(mnt)) - return 0; + return 1; spin_lock(&dcache_lock); repeat: @@ -126,9 +124,9 @@ resume: if (d_mountpoint(dentry)) { /* First busy => tree busy */ - if (!autofs4_check_mount(mnt, dentry)) { + if (autofs4_mount_busy(mnt, dentry)) { dput(dentry); - return 0; + return 1; } } @@ -144,7 +142,7 @@ resume: } spin_unlock(&dcache_lock); - return 1; + return 0; } static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, @@ -188,7 +186,7 @@ resume: goto cont; /* Can we umount this guy */ - if (autofs4_check_mount(mnt, dentry)) + if (!autofs4_mount_busy(mnt, dentry)) return dentry; } @@ -241,7 +239,7 @@ static struct dentry *autofs4_expire(struct super_block *sb, struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ - if ( !simple_positive(dentry) ) { + if (!simple_positive(dentry)) { next = next->next; continue; } @@ -259,21 +257,21 @@ static struct dentry *autofs4_expire(struct super_block *sb, goto next; /* Can we umount this guy */ - if (autofs4_check_mount(mnt, dentry)) { + if (!autofs4_mount_busy(mnt, dentry)) { expired = dentry; break; } goto next; } - if ( simple_empty(dentry) ) + if (simple_empty(dentry)) goto next; /* Case 2: tree mount, expire iff entire tree is not busy */ if (!exp_leaves) { /* Lock the tree as we must expire as a whole */ spin_lock(&sbi->fs_lock); - if (autofs4_check_tree(mnt, dentry, timeout, do_now)) { + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { struct autofs_info *inf = autofs4_dentry_ino(dentry); /* Set this flag early to catch sys_chdir and the like */ @@ -297,7 +295,7 @@ next: next = next->next; } - if ( expired ) { + if (expired) { DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); @@ -352,16 +350,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, return -EFAULT; if ((dentry = autofs4_expire(sb, mnt, sbi, do_now)) != NULL) { - struct autofs_info *de_info = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs4_dentry_ino(dentry); /* This is synchronous because it makes the daemon a little easier */ - de_info->flags |= AUTOFS_INF_EXPIRING; + ino->flags |= AUTOFS_INF_EXPIRING; ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); - de_info->flags &= ~AUTOFS_INF_EXPIRING; + ino->flags &= ~AUTOFS_INF_EXPIRING; dput(dentry); } - + return ret; } From 1ce12bad85863478619688c0c7363f93a9e5edb8 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:45 -0800 Subject: [PATCH 049/306] [PATCH] autofs4: simplify expire tree traversal Simplify the expire tree traversal code by using a function from namespace.c to calculate the next entry in the top down tree traversals carried out during the expire operation. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 102 +++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 62 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index bcc17f533699..165fe9e2d570 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -72,6 +72,27 @@ done: return status; } +/* + * Calculate next entry in top down tree traversal. + * From next_mnt in namespace.c - elegant. + */ +static struct dentry *next_dentry(struct dentry *p, struct dentry *root) +{ + struct list_head *next = p->d_subdirs.next; + + if (next == &p->d_subdirs) { + while (1) { + if (p == root) + return NULL; + next = p->d_u.d_child.next; + if (next != &p->d_parent->d_subdirs) + break; + p = p->d_parent; + } + } + return list_entry(next, struct dentry, d_u.d_child); +} + /* Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ @@ -80,8 +101,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { - struct dentry *this_parent = top; - struct list_head *next; + struct dentry *p; DPRINTK("top %p %.*s", top, (int)top->d_name.len, top->d_name.name); @@ -99,49 +119,28 @@ static int autofs4_tree_busy(struct vfsmount *mnt, return 1; spin_lock(&dcache_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); - + for (p = top; p; p = next_dentry(p, top)) { /* Negative dentry - give up */ - if (!simple_positive(dentry)) { - next = next->next; + if (!simple_positive(p)) continue; - } DPRINTK("dentry %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); + p, (int) p->d_name.len, p->d_name.name); - if (!simple_empty_nolock(dentry)) { - this_parent = dentry; - goto repeat; - } - - dentry = dget(dentry); + p = dget(p); spin_unlock(&dcache_lock); - if (d_mountpoint(dentry)) { + if (d_mountpoint(p)) { /* First busy => tree busy */ - if (autofs4_mount_busy(mnt, dentry)) { - dput(dentry); + if (autofs4_mount_busy(mnt, p)) { + dput(p); return 1; } } - - dput(dentry); + dput(p); spin_lock(&dcache_lock); - next = next->next; - } - - if (this_parent != top) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; - goto resume; } spin_unlock(&dcache_lock); - return 0; } @@ -150,59 +149,38 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, unsigned long timeout, int do_now) { - struct dentry *this_parent = parent; - struct list_head *next; + struct dentry *p; DPRINTK("parent %p %.*s", parent, (int)parent->d_name.len, parent->d_name.name); spin_lock(&dcache_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); - + for (p = parent; p; p = next_dentry(p, parent)) { /* Negative dentry - give up */ - if (!simple_positive(dentry)) { - next = next->next; + if (!simple_positive(p)) continue; - } DPRINTK("dentry %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); + p, (int) p->d_name.len, p->d_name.name); - if (!list_empty(&dentry->d_subdirs)) { - this_parent = dentry; - goto repeat; - } - - dentry = dget(dentry); + p = dget(p); spin_unlock(&dcache_lock); - if (d_mountpoint(dentry)) { + if (d_mountpoint(p)) { /* Can we expire this guy */ - if (!autofs4_can_expire(dentry, timeout, do_now)) + if (!autofs4_can_expire(p, timeout, do_now)) goto cont; /* Can we umount this guy */ - if (!autofs4_mount_busy(mnt, dentry)) - return dentry; + if (!autofs4_mount_busy(mnt, p)) + return p; } cont: - dput(dentry); + dput(p); spin_lock(&dcache_lock); - next = next->next; - } - - if (this_parent != parent) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; - goto resume; } spin_unlock(&dcache_lock); - return NULL; } From 1aff3c8b0511b5bb54acf7859e0c6ec9ae7287a9 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:46 -0800 Subject: [PATCH 050/306] [PATCH] autofs4: fix false negative return from expire Fix the case where an expire returns busy on a tree mount when it is in fact not busy. This case was overlooked when the patch to prevent the expiring away of "scaffolding" directories for tree mounts was applied. The problem arises when a tree of mounts is a member of a map with other keys. The current logic will not expire the tree if any other mount in the map is busy. The solution is to maintain a "minimum" use count for each autofs dentry and compare this to the actual dentry usage count during expire. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/expire.c | 34 +++++++++++++++++++++++++--------- fs/autofs4/inode.c | 12 +++++++++++- fs/autofs4/root.c | 23 ++++++++++++++++++++++- 4 files changed, 59 insertions(+), 11 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index eea25934da62..00da71d0f32a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -55,6 +55,7 @@ struct autofs_info { struct autofs_sb_info *sbi; unsigned long last_used; + atomic_t count; mode_t mode; size_t size; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 165fe9e2d570..053d92a745b9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -101,6 +101,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { + struct autofs_info *ino; struct dentry *p; DPRINTK("top %p %.*s", @@ -110,14 +111,6 @@ static int autofs4_tree_busy(struct vfsmount *mnt, if (!simple_positive(top)) return 1; - /* Timeout of a tree mount is determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) - return 1; - - /* Is someone visiting anywhere in the tree ? */ - if (may_umount_tree(mnt)) - return 1; - spin_lock(&dcache_lock); for (p = top; p; p = next_dentry(p, top)) { /* Negative dentry - give up */ @@ -130,17 +123,40 @@ static int autofs4_tree_busy(struct vfsmount *mnt, p = dget(p); spin_unlock(&dcache_lock); + /* + * Is someone visiting anywhere in the subtree ? + * If there's no mount we need to check the usage + * count for the autofs dentry. + */ + ino = autofs4_dentry_ino(p); if (d_mountpoint(p)) { - /* First busy => tree busy */ if (autofs4_mount_busy(mnt, p)) { dput(p); return 1; } + } else { + unsigned int ino_count = atomic_read(&ino->count); + + /* allow for dget above and top is already dgot */ + if (p == top) + ino_count += 2; + else + ino_count++; + + if (atomic_read(&p->d_count) > ino_count) { + dput(p); + return 1; + } } dput(p); spin_lock(&dcache_lock); } spin_unlock(&dcache_lock); + + /* Timeout of a tree mount is ultimately determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + return 0; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 1ad98d48e550..2335b1d6490f 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -46,6 +46,7 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, ino->size = 0; ino->last_used = jiffies; + atomic_set(&ino->count, 0); ino->sbi = sbi; @@ -64,10 +65,19 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, void autofs4_free_ino(struct autofs_info *ino) { + struct autofs_info *p_ino; + if (ino->dentry) { ino->dentry->d_fsdata = NULL; - if (ino->dentry->d_inode) + if (ino->dentry->d_inode) { + struct dentry *parent = ino->dentry->d_parent; + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(parent); + if (p_ino && parent != ino->dentry) + atomic_dec(&p_ino->count); + } dput(ino->dentry); + } ino->dentry = NULL; } if (ino->free) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c7ff35774344..d196712c4b94 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -490,6 +490,7 @@ static int autofs4_dir_symlink(struct inode *dir, { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; struct inode *inode; char *cp; @@ -523,6 +524,10 @@ static int autofs4_dir_symlink(struct inode *dir, dentry->d_fsdata = ino; ino->dentry = dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); ino->inode = inode; dir->i_mtime = CURRENT_TIME; @@ -549,11 +554,17 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; /* This allows root to remove symlinks */ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) return -EACCES; + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } dput(ino->dentry); dentry->d_inode->i_size = 0; @@ -570,6 +581,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -584,8 +596,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } dput(ino->dentry); - dentry->d_inode->i_size = 0; dentry->d_inode->i_nlink = 0; @@ -599,6 +615,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; struct inode *inode; if ( !autofs4_oz_mode(sbi) ) @@ -621,6 +638,10 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_fsdata = ino; ino->dentry = dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); ino->inode = inode; dir->i_nlink++; dir->i_mtime = CURRENT_TIME; From e0a7aae94030b878601eb67686b696de4a3764f0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:47 -0800 Subject: [PATCH 051/306] [PATCH] autofs4: expire mounts that hold no (extra) references only Alter the expire semantics that define how "busyness" is determined. Currently a last_used counter is updated on every revalidate from processes other than the mount owner process group. This patch changes that so that an expire candidate is busy only if it has a reference count greater than the expected minimum, such as when there is an open file or working directory in use. This method is the only way that busyness can be established for direct mounts within the new implementation. For consistency the expire semantic is made the same for all mounts. A side effect of the patch is that mounts which remain mounted unessessarily in the presence of some GUI programs that scan the filesystem should now expire. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 36 ++++++++++++++++++++++-------------- fs/autofs4/root.c | 4 ++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 053d92a745b9..6ae2fc8233ff 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -47,6 +47,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, /* Check a mount point for busyness */ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { + struct dentry *top = dentry; int status = 1; DPRINTK("dentry %p %.*s", @@ -62,9 +63,14 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (is_autofs4_dentry(dentry)) goto done; - /* The big question */ - if (may_umount_tree(mnt) == 0) - status = 0; + /* Update the expiry counter if fs is busy */ + if (may_umount_tree(mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + ino->last_used = jiffies; + goto done; + } + + status = 0; done: DPRINTK("returning = %d", status); mntput(mnt); @@ -101,7 +107,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { - struct autofs_info *ino; + struct autofs_info *top_ino = autofs4_dentry_ino(top); struct dentry *p; DPRINTK("top %p %.*s", @@ -127,14 +133,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt, * Is someone visiting anywhere in the subtree ? * If there's no mount we need to check the usage * count for the autofs dentry. + * If the fs is busy update the expiry counter. */ - ino = autofs4_dentry_ino(p); if (d_mountpoint(p)) { if (autofs4_mount_busy(mnt, p)) { + top_ino->last_used = jiffies; dput(p); return 1; } } else { + struct autofs_info *ino = autofs4_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); /* allow for dget above and top is already dgot */ @@ -144,6 +152,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, ino_count++; if (atomic_read(&p->d_count) > ino_count) { + top_ino->last_used = jiffies; dput(p); return 1; } @@ -183,14 +192,13 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, spin_unlock(&dcache_lock); if (d_mountpoint(p)) { - /* Can we expire this guy */ - if (!autofs4_can_expire(p, timeout, do_now)) + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, p)) goto cont; - /* Can we umount this guy */ - if (!autofs4_mount_busy(mnt, p)) + /* Can we expire this guy */ + if (autofs4_can_expire(p, timeout, do_now)) return p; - } cont: dput(p); @@ -246,12 +254,12 @@ static struct dentry *autofs4_expire(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - /* Can we expire this guy */ - if (!autofs4_can_expire(dentry, timeout, do_now)) + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, dentry)) goto next; - /* Can we umount this guy */ - if (!autofs4_mount_busy(mnt, dentry)) { + /* Can we expire this guy */ + if (autofs4_can_expire(dentry, timeout, do_now)) { expired = dentry; break; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d196712c4b94..3a4a5b47575c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -330,6 +330,10 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f if (!autofs4_oz_mode(sbi)) autofs4_update_usage(mnt, dentry); + /* Initialize expiry counter after successful mount */ + if (ino) + ino->last_used = jiffies; + spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); From 862b110f0132401e422783bf68521ade3dc67578 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:48 -0800 Subject: [PATCH 052/306] [PATCH] autofs4: remove update_atime unused function Remove the update of i_atime from autofs4 in favour of having VFS update it. i_atime is never used for expire in autofs4. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 3a4a5b47575c..72dca3335e25 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -84,24 +84,6 @@ static int autofs4_root_readdir(struct file *file, void *dirent, return dcache_readdir(file, dirent, filldir); } -/* Update usage from here to top of tree, so that scan of - top-level directories will give a useful result */ -static void autofs4_update_usage(struct vfsmount *mnt, struct dentry *dentry) -{ - struct dentry *top = dentry->d_sb->s_root; - - spin_lock(&dcache_lock); - for(; dentry != top; dentry = dentry->d_parent) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - - if (ino) { - touch_atime(mnt, dentry); - ino->last_used = jiffies; - } - } - spin_unlock(&dcache_lock); -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_dentry; @@ -246,10 +228,9 @@ out: return dcache_readdir(file, dirent, filldir); } -static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int flags) +static int try_to_fill_dentry(struct dentry *dentry, int flags) { - struct super_block *sb = mnt->mnt_sb; - struct autofs_sb_info *sbi = autofs4_sbi(sb); + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status = 0; @@ -323,13 +304,6 @@ static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int f } } - /* - * We don't update the usages for the autofs daemon itself, this - * is necessary for recursive autofs mounts - */ - if (!autofs4_oz_mode(sbi)) - autofs4_update_usage(mnt, dentry); - /* Initialize expiry counter after successful mount */ if (ino) ino->last_used = jiffies; @@ -357,7 +331,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) /* Pending dentry */ if (autofs4_ispending(dentry)) { if (!oz_mode) - status = try_to_fill_dentry(nd->mnt, dentry, flags); + status = try_to_fill_dentry(dentry, flags); return status; } @@ -374,15 +348,11 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); if (!oz_mode) - status = try_to_fill_dentry(nd->mnt, dentry, flags); + status = try_to_fill_dentry(dentry, flags); return status; } spin_unlock(&dcache_lock); - /* Update the usage list */ - if (!oz_mode) - autofs4_update_usage(nd->mnt, dentry); - return 1; } From d7c4a5f1080a61fd4a3a00ba5f741986f8fb71f0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:49 -0800 Subject: [PATCH 053/306] [PATCH] autofs4: add a show mount options for proc filesystem Add show_options method to display autofs4 mount options in the proc filesystem. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 3 +++ fs/autofs4/inode.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 00da71d0f32a..d82a019ff8ef 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -87,11 +87,14 @@ struct autofs_wait_queue { struct autofs_sb_info { u32 magic; struct dentry *root; + int pipefd; struct file *pipe; pid_t oz_pgrp; int catatonic; int version; int sub_version; + int min_proto; + int max_proto; unsigned long exp_timeout; int reghost_enabled; int needs_reghost; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2335b1d6490f..d9a71dab40fc 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -163,9 +164,26 @@ static void autofs4_put_super(struct super_block *sb) DPRINTK("shutting down"); } +static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); + + if (!sbi) + return 0; + + seq_printf(m, ",fd=%d", sbi->pipefd); + seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); + seq_printf(m, ",minproto=%d", sbi->min_proto); + seq_printf(m, ",maxproto=%d", sbi->max_proto); + + return 0; +} + static struct super_operations autofs4_sops = { .put_super = autofs4_put_super, .statfs = simple_statfs, + .show_options = autofs4_show_options, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; @@ -261,7 +279,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int minproto, maxproto; sbi = (struct autofs_sb_info *) kmalloc(sizeof(*sbi), GFP_KERNEL); if ( !sbi ) @@ -273,12 +290,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; sbi->root = NULL; + sbi->pipefd = -1; sbi->catatonic = 0; sbi->exp_timeout = 0; sbi->oz_pgrp = process_group(current); sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; + sbi->min_proto = 0; + sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; @@ -311,22 +331,26 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, &sbi->oz_pgrp, - &minproto, &maxproto)) { + &sbi->min_proto, &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } /* Couldn't this be tested earlier? */ - if (maxproto < AUTOFS_MIN_PROTO_VERSION || - minproto > AUTOFS_MAX_PROTO_VERSION) { + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { printk("autofs: kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", - minproto, maxproto, + sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); goto fail_dput; } - sbi->version = maxproto > AUTOFS_MAX_PROTO_VERSION ? AUTOFS_MAX_PROTO_VERSION : maxproto; + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); @@ -339,6 +363,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if ( !pipe->f_op || !pipe->f_op->write ) goto fail_fput; sbi->pipe = pipe; + sbi->pipefd = pipefd; /* * Take a reference to the root dentry so we get a chance to From e77fbddf77c071a5735a4bc63a30649bd64baf14 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:49 -0800 Subject: [PATCH 054/306] [PATCH] autofs4: white space cleanup for waitq.c Whitespace and formating changes to waitq code. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index be78e9378c03..b0bb9d43bcd9 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -33,7 +33,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) sbi->catatonic = 1; wq = sbi->queues; sbi->queues = NULL; /* Erase all wait queues */ - while ( wq ) { + while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name); @@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; } - shrink_dcache_sb(sbi->sb); } @@ -165,7 +164,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, int len, status; /* In catatonic mode, we don't wait for nobody */ - if ( sbi->catatonic ) + if (sbi->catatonic) return -ENOENT; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); @@ -190,7 +189,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, break; } - if ( !wq ) { + if (!wq) { /* Can't wait for an expire if there's no mount */ if (notify == NFY_NONE && !d_mountpoint(dentry)) { kfree(name); @@ -200,7 +199,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); - if ( !wq ) { + if (!wq) { kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; @@ -240,14 +239,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* wq->name is NULL if and only if the lock is already released */ - if ( sbi->catatonic ) { + if (sbi->catatonic) { /* We might have slept, so check again for catatonic mode */ wq->status = -ENOENT; kfree(wq->name); wq->name = NULL; } - if ( wq->name ) { + if (wq->name) { /* Block all but "shutdown" signals while waiting */ sigset_t oldset; unsigned long irqflags; @@ -283,12 +282,12 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); - for ( wql = &sbi->queues ; (wq = *wql) != 0 ; wql = &wq->next ) { - if ( wq->wait_queue_token == wait_queue_token ) + for (wql = &sbi->queues ; (wq = *wql) != 0 ; wql = &wq->next) { + if (wq->wait_queue_token == wait_queue_token) break; } - if ( !wq ) { + if (!wq) { mutex_unlock(&sbi->wq_mutex); return -EINVAL; } From 90a59c7cf5dd68b41ffab61dd30eba1ef5746e66 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:50 -0800 Subject: [PATCH 055/306] [PATCH] autofs4: rename simple_empty_nolock function Rename the function simple_empty_nolock to __simple_empty in line with kernel naming conventions. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/root.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d82a019ff8ef..bc1b0543d2b6 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -201,7 +201,7 @@ static inline int simple_positive(struct dentry *dentry) return dentry->d_inode && !d_unhashed(dentry); } -static inline int simple_empty_nolock(struct dentry *dentry) +static inline int __simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 72dca3335e25..dcd4802a5d5f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -343,7 +343,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) spin_lock(&dcache_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && - simple_empty_nolock(dentry)) { + __simple_empty(dentry)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); From e3474a8eb38e48dea6690d1fabd75f3c7fd2f93f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:51 -0800 Subject: [PATCH 056/306] [PATCH] autofs4: change may_umount* functions to boolean Change the functions may_umount and may_umount_tree to boolean functions to aid code readability. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/dirhash.c | 2 +- fs/autofs4/expire.c | 2 +- fs/autofs4/root.c | 2 +- fs/namespace.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 5ccfcf26310d..3fded389d06b 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -92,7 +92,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, ; dput(dentry); - if ( may_umount(mnt) == 0 ) { + if ( may_umount(mnt) ) { mntput(mnt); DPRINTK(("autofs: signaling expire on %s\n", ent->name)); return ent; /* Expirable! */ diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 6ae2fc8233ff..02a218fbde5f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -64,7 +64,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) goto done; /* Update the expiry counter if fs is busy */ - if (may_umount_tree(mnt)) { + if (!may_umount_tree(mnt)) { struct autofs_info *ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index dcd4802a5d5f..26eb1f024866 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -699,7 +699,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; - if (may_umount(mnt) == 0) + if (may_umount(mnt)) status = 1; DPRINTK("returning %d", status); diff --git a/fs/namespace.c b/fs/namespace.c index e069a4c5e389..bf478addb852 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -459,9 +459,9 @@ int may_umount_tree(struct vfsmount *mnt) spin_unlock(&vfsmount_lock); if (actual_refs > minimum_refs) - return -EBUSY; + return 0; - return 0; + return 1; } EXPORT_SYMBOL(may_umount_tree); @@ -481,10 +481,10 @@ EXPORT_SYMBOL(may_umount_tree); */ int may_umount(struct vfsmount *mnt) { - int ret = 0; + int ret = 1; spin_lock(&vfsmount_lock); if (propagate_mount_busy(mnt, 2)) - ret = -EBUSY; + ret = 0; spin_unlock(&vfsmount_lock); return ret; } From f75ba3ade8a4599d67040a9493d75a864e7b329c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:52 -0800 Subject: [PATCH 057/306] [PATCH] autofs4: increase module version Update autofs4 version. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_fs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 9343c89d843c..d998ddcf7288 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 4 -#define AUTOFS_PROTO_SUBVERSION 7 +#define AUTOFS_PROTO_SUBVERSION 10 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 From 051d381259eb57d6074d02a6ba6e90e744f1a29f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:53 -0800 Subject: [PATCH 058/306] [PATCH] autofs4: nameidata needs to be up to date for follow_link In order to be able to trigger a mount using the follow_link inode method the nameidata struct that is passed in needs to have the vfsmount of the autofs trigger not its parent. During a path walk if an autofs trigger is mounted on a dentry, when the follow_link method is called, the nameidata struct contains the vfsmount and mountpoint dentry of the parent mount while the dentry that is passed in is the root of the autofs trigger mount. I believe it is impossible to get the vfsmount of the trigger mount, within the follow_link method, when only the parent vfsmount and the root dentry of the trigger mount are known. This patch updates the nameidata struct on entry to __do_follow_link if it detects that it is out of date. It moves the path_to_nameidata to above __do_follow_link to facilitate calling it from there. The dput_path is moved as well as that seemed sensible. No changes are made to these two functions. Signed-off-by: Ian Kent Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 57 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 98dc2e134362..22f6e8d16aa8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -546,33 +546,6 @@ struct path { struct dentry *dentry; }; -static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) -{ - int error; - void *cookie; - struct dentry *dentry = path->dentry; - - touch_atime(path->mnt, dentry); - nd_set_link(nd, NULL); - - if (path->mnt == nd->mnt) - mntget(path->mnt); - cookie = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(cookie); - if (!IS_ERR(cookie)) { - char *s = nd_get_link(nd); - error = 0; - if (s) - error = __vfs_follow_link(nd, s); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, nd, cookie); - } - dput(dentry); - mntput(path->mnt); - - return error; -} - static inline void dput_path(struct path *path, struct nameidata *nd) { dput(path->dentry); @@ -589,6 +562,36 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd) nd->dentry = path->dentry; } +static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) +{ + int error; + void *cookie; + struct dentry *dentry = path->dentry; + + touch_atime(path->mnt, dentry); + nd_set_link(nd, NULL); + + if (path->mnt != nd->mnt) { + path_to_nameidata(path, nd); + dget(dentry); + } + mntget(path->mnt); + cookie = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(cookie); + if (!IS_ERR(cookie)) { + char *s = nd_get_link(nd); + error = 0; + if (s) + error = __vfs_follow_link(nd, s); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, nd, cookie); + } + dput(dentry); + mntput(path->mnt); + + return error; +} + /* * This limits recursive symlink follows to 8, while * limiting consecutive symlinks to 40. From 34ca959cfc15cf09ad4da4f31ab034691e51af78 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:54 -0800 Subject: [PATCH 059/306] [PATCH] autofs4: add v5 follow_link mount trigger method This patch adds a follow_link inode method for the root of an autofs direct mount trigger. It also adds the corresponding mount options and updates the show_mount method. Signed-off-by: Ian Kent Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 8 ++++++ fs/autofs4/inode.c | 52 +++++++++++++++++++++++++++++------ fs/autofs4/root.c | 64 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 103 insertions(+), 21 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index bc1b0543d2b6..ed388a1d8fc4 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -3,6 +3,7 @@ * linux/fs/autofs/autofs_i.h * * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -84,6 +85,10 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d +#define AUTOFS_TYP_INDIRECT 0x0001 +#define AUTOFS_TYP_DIRECT 0x0002 +#define AUTOFS_TYP_OFFSET 0x0004 + struct autofs_sb_info { u32 magic; struct dentry *root; @@ -96,6 +101,7 @@ struct autofs_sb_info { int min_proto; int max_proto; unsigned long exp_timeout; + unsigned int type; int reghost_enabled; int needs_reghost; struct super_block *sb; @@ -162,6 +168,8 @@ int autofs4_expire_multi(struct super_block *, struct vfsmount *, extern struct inode_operations autofs4_symlink_inode_operations; extern struct inode_operations autofs4_dir_inode_operations; extern struct inode_operations autofs4_root_inode_operations; +extern struct inode_operations autofs4_indirect_root_inode_operations; +extern struct inode_operations autofs4_direct_root_inode_operations; extern struct file_operations autofs4_dir_operations; extern struct file_operations autofs4_root_operations; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d9a71dab40fc..3801bed94e45 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -3,6 +3,7 @@ * linux/fs/autofs/inode.c * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -177,6 +178,13 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); + if (sbi->type & AUTOFS_TYP_OFFSET) + seq_printf(m, ",offset"); + else if (sbi->type & AUTOFS_TYP_DIRECT) + seq_printf(m, ",direct"); + else + seq_printf(m, ",indirect"); + return 0; } @@ -186,7 +194,8 @@ static struct super_operations autofs4_sops = { .show_options = autofs4_show_options, }; -enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; +enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, + Opt_indirect, Opt_direct, Opt_offset}; static match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -195,11 +204,15 @@ static match_table_t tokens = { {Opt_pgrp, "pgrp=%u"}, {Opt_minproto, "minproto=%u"}, {Opt_maxproto, "maxproto=%u"}, + {Opt_indirect, "indirect"}, + {Opt_direct, "direct"}, + {Opt_offset, "offset"}, {Opt_err, NULL} }; static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, - pid_t *pgrp, int *minproto, int *maxproto) + pid_t *pgrp, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -253,6 +266,15 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, return 1; *maxproto = option; break; + case Opt_indirect: + *type = AUTOFS_TYP_INDIRECT; + break; + case Opt_direct: + *type = AUTOFS_TYP_DIRECT; + break; + case Opt_offset: + *type = AUTOFS_TYP_DIRECT | AUTOFS_TYP_OFFSET; + break; default: return 1; } @@ -271,6 +293,11 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi) return ino; } +void autofs4_dentry_release(struct dentry *); +static struct dentry_operations autofs4_sb_dentry_operations = { + .d_release = autofs4_dentry_release, +}; + int autofs4_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; @@ -297,6 +324,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; + sbi->type = 0; sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -315,27 +343,31 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!ino) goto fail_free; root_inode = autofs4_get_inode(s, ino); - kfree(ino); if (!root_inode) - goto fail_free; + goto fail_ino; - root_inode->i_op = &autofs4_root_inode_operations; - root_inode->i_fop = &autofs4_root_operations; root = d_alloc_root(root_inode); - pipe = NULL; - if (!root) goto fail_iput; + pipe = NULL; + + root->d_op = &autofs4_sb_dentry_operations; + root->d_fsdata = ino; /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, + &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + root_inode->i_fop = &autofs4_root_operations; + root_inode->i_op = sbi->type & AUTOFS_TYP_DIRECT ? + &autofs4_direct_root_inode_operations : + &autofs4_indirect_root_inode_operations; + /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { @@ -391,6 +423,8 @@ fail_dput: fail_iput: printk("autofs: get root dentry failed\n"); iput(root_inode); +fail_ino: + kfree(ino); fail_free: kfree(sbi); fail_unlock: diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 26eb1f024866..3f0048582248 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -4,7 +4,7 @@ * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2003 Ian Kent + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -30,6 +30,7 @@ static int autofs4_dir_close(struct inode *inode, struct file *file); static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static void *autofs4_follow_link(struct dentry *, struct nameidata *); struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -46,7 +47,7 @@ struct file_operations autofs4_dir_operations = { .readdir = autofs4_dir_readdir, }; -struct inode_operations autofs4_root_inode_operations = { +struct inode_operations autofs4_indirect_root_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .symlink = autofs4_dir_symlink, @@ -54,6 +55,11 @@ struct inode_operations autofs4_root_inode_operations = { .rmdir = autofs4_dir_rmdir, }; +struct inode_operations autofs4_direct_root_inode_operations = { + .lookup = autofs4_lookup, + .follow_link = autofs4_follow_link, +}; + struct inode_operations autofs4_dir_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, @@ -252,7 +258,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) */ status = d_invalidate(dentry); if (status != -EBUSY) - return 0; + return -ENOENT; } DPRINTK("dentry=%p %.*s ino=%p", @@ -271,17 +277,17 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) DPRINTK("mount done status=%d", status); if (status && dentry->d_inode) - return 0; /* Try to get the kernel to invalidate this dentry */ + return status; /* Try to get the kernel to invalidate this dentry */ /* Turn this into a real negative dentry? */ if (status == -ENOENT) { spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - return 0; + return status; } else if (status) { /* Return a negative dentry, but leave it "pending" */ - return 0; + return status; } /* Trigger mount for path component or follow link */ } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || @@ -300,7 +306,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - return 0; + return status; } } @@ -311,7 +317,41 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - return 1; + return status; +} + +/* For autofs direct mounts the follow link triggers the mount */ +static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + int oz_mode = autofs4_oz_mode(sbi); + unsigned int lookup_type; + int status; + + DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", + dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, + nd->flags); + + /* If it's our master or we shouldn't trigger a mount we're done */ + lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY); + if (oz_mode || !lookup_type) + goto done; + + status = try_to_fill_dentry(dentry, 0); + if (status) + goto out_error; + + if (!autofs4_follow_mount(&nd->mnt, &nd->dentry)) { + status = -ENOENT; + goto out_error; + } + +done: + return NULL; + +out_error: + path_release(nd); + return ERR_PTR(status); } /* @@ -326,13 +366,13 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); int oz_mode = autofs4_oz_mode(sbi); int flags = nd ? nd->flags : 0; - int status = 1; + int status = 0; /* Pending dentry */ if (autofs4_ispending(dentry)) { if (!oz_mode) status = try_to_fill_dentry(dentry, flags); - return status; + return !status; } /* Negative dentry.. invalidate if "old" */ @@ -349,14 +389,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) spin_unlock(&dcache_lock); if (!oz_mode) status = try_to_fill_dentry(dentry, flags); - return status; + return !status; } spin_unlock(&dcache_lock); return 1; } -static void autofs4_dentry_release(struct dentry *de) +void autofs4_dentry_release(struct dentry *de) { struct autofs_info *inf; From 3a15e2ab5d6e79a79291734ac24f33d51c0ae389 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:55 -0800 Subject: [PATCH 060/306] [PATCH] autofs4: add v5 expire logic This patch adds expire logic for autofs direct mounts. Signed-off-by: Ian Kent Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 96 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 02a218fbde5f..8fd92eaf936d 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -4,7 +4,7 @@ * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2003 Ian Kent + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -99,6 +99,39 @@ static struct dentry *next_dentry(struct dentry *p, struct dentry *root) return list_entry(next, struct dentry, d_u.d_child); } +/* + * Check a direct mount point for busyness. + * Direct mounts have similar expiry semantics to tree mounts. + * The tree is not busy iff no mountpoints are busy and there are no + * autofs submounts. + */ +static int autofs4_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + DPRINTK("top %p %.*s", + top, (int) top->d_name.len, top->d_name.name); + + /* Not a mountpoint - give up */ + if (!d_mountpoint(top)) + return 1; + + /* If it's busy update the expiry counters */ + if (!may_umount_tree(mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + if (ino) + ino->last_used = jiffies; + return 1; + } + + /* Timeout of a direct mount is determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + /* Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ @@ -208,16 +241,48 @@ cont: return NULL; } +/* Check if we can expire a direct mount (possibly a tree) */ +static struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = dget(sb->s_root); + int do_now = how & AUTOFS_EXP_IMMEDIATE; + + if (!sbi->exp_timeout || !root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + /* Lock the tree as we must expire as a whole */ + spin_lock(&sbi->fs_lock); + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + struct autofs_info *ino = autofs4_dentry_ino(root); + + /* Set this flag early to catch sys_chdir and the like */ + ino->flags |= AUTOFS_INF_EXPIRING; + spin_unlock(&sbi->fs_lock); + return root; + } + spin_unlock(&sbi->fs_lock); + dput(root); + + return NULL; +} + /* * Find an eligible tree to time-out * A tree is eligible if :- * - it is unused by any user process * - it has been unused for exp_timeout time */ -static struct dentry *autofs4_expire(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +static struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -249,7 +314,12 @@ static struct dentry *autofs4_expire(struct super_block *sb, dentry = dget(dentry); spin_unlock(&dcache_lock); - /* Case 1: indirect mount or top level direct mount */ + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). + */ if (d_mountpoint(dentry)) { DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); @@ -283,7 +353,10 @@ static struct dentry *autofs4_expire(struct super_block *sb, break; } spin_unlock(&sbi->fs_lock); - /* Case 3: direct mount, expire individual leaves */ + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ } else { expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); if (expired) { @@ -325,7 +398,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - if ((dentry = autofs4_expire(sb, mnt, sbi, 0)) == NULL) + if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) return -EAGAIN; pkt.len = dentry->d_name.len; @@ -351,7 +424,12 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if ((dentry = autofs4_expire(sb, mnt, sbi, do_now)) != NULL) { + if (sbi->type & AUTOFS_TYP_DIRECT) + dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); + else + dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); + + if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); /* This is synchronous because it makes the daemon a From 5c0a32fc2cd0be912511199449a37a4a6f0f582d Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:55 -0800 Subject: [PATCH 061/306] [PATCH] autofs4: add new packet type for v5 communications This patch define a new autofs packet for autofs v5 and updates the waitq.c functions to handle the additional packet type. Signed-off-by: Ian Kent Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 23 +++++++---- fs/autofs4/waitq.c | 86 ++++++++++++++++++++++++++++++++++------ include/linux/auto_fs4.h | 51 +++++++++++++++++++++--- 3 files changed, 136 insertions(+), 24 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index ed388a1d8fc4..37c8d909d1e9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -77,6 +77,12 @@ struct autofs_wait_queue { int hash; int len; char *name; + u32 dev; + u64 ino; + uid_t uid; + gid_t gid; + pid_t pid; + pid_t tgid; /* This is for status reporting upon return */ int status; atomic_t notified; @@ -180,13 +186,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info /* Queue management functions */ -enum autofs_notify -{ - NFY_NONE, - NFY_MOUNT, - NFY_EXPIRE -}; - int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); @@ -204,6 +203,16 @@ static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **de return res; } +static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +{ + return sbi->sb->s_root->d_inode->i_ino; +} + static inline int simple_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index b0bb9d43bcd9..12da2c977b0a 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -3,7 +3,7 @@ * linux/fs/autofs/waitq.c * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2001-2003 Ian Kent + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -97,7 +97,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; - if (type == autofs_ptype_missing) { + switch (type) { + /* Kernel protocol v4 missing and expire packets */ + case autofs_ptype_missing: + { struct autofs_packet_missing *mp = &pkt.missing; pktsz = sizeof(*mp); @@ -106,7 +109,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mp->len = wq->len; memcpy(mp->name, wq->name, wq->len); mp->name[wq->len] = '\0'; - } else if (type == autofs_ptype_expire_multi) { + break; + } + case autofs_ptype_expire_multi: + { struct autofs_packet_expire_multi *ep = &pkt.expire_multi; pktsz = sizeof(*ep); @@ -115,7 +121,34 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, ep->len = wq->len; memcpy(ep->name, wq->name, wq->len); ep->name[wq->len] = '\0'; - } else { + break; + } + /* + * Kernel protocol v5 packet for handling indirect and direct + * mount missing and expire requests + */ + case autofs_ptype_missing_indirect: + case autofs_ptype_expire_indirect: + case autofs_ptype_missing_direct: + case autofs_ptype_expire_direct: + { + struct autofs_v5_packet *packet = &pkt.v5_packet; + + pktsz = sizeof(*packet); + + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->len; + memcpy(packet->name, wq->name, wq->len); + packet->name[wq->len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = wq->uid; + packet->gid = wq->gid; + packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } + default: printk("autofs4_notify_daemon: bad type %d!\n", type); return; } @@ -161,7 +194,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, { struct autofs_wait_queue *wq; char *name; - int len, status; + unsigned int len = 0; + unsigned int hash = 0; + int status; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) @@ -171,11 +206,17 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (!name) return -ENOMEM; - len = autofs4_getpath(sbi, dentry, &name); - if (!len) { - kfree(name); - return -ENOENT; + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYP_DIRECT)) + len = sprintf(name, "%p", dentry); + else { + len = autofs4_getpath(sbi, dentry, &name); + if (!len) { + kfree(name); + return -ENOENT; + } } + hash = full_name_hash(name, len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); @@ -211,9 +252,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); - wq->hash = dentry->d_name.hash; + wq->hash = hash; wq->name = name; wq->len = len; + wq->dev = autofs4_get_dev(sbi); + wq->ino = autofs4_get_ino(sbi); + wq->uid = current->uid; + wq->gid = current->gid; + wq->pid = current->pid; + wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ atomic_set(&wq->wait_ctr, 2); atomic_set(&wq->notified, 1); @@ -227,8 +274,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } if (notify != NFY_NONE && atomic_dec_and_test(&wq->notified)) { - int type = (notify == NFY_MOUNT ? - autofs_ptype_missing : autofs_ptype_expire_multi); + int type; + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = (sbi->type & AUTOFS_TYP_DIRECT) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = (sbi->type & AUTOFS_TYP_DIRECT) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index d998ddcf7288..0a6bc52ffe88 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -19,18 +19,37 @@ #undef AUTOFS_MIN_PROTO_VERSION #undef AUTOFS_MAX_PROTO_VERSION -#define AUTOFS_PROTO_VERSION 4 +#define AUTOFS_PROTO_VERSION 5 #define AUTOFS_MIN_PROTO_VERSION 3 -#define AUTOFS_MAX_PROTO_VERSION 4 +#define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 10 +#define AUTOFS_PROTO_SUBVERSION 0 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 #define AUTOFS_EXP_LEAVES 2 -/* New message type */ -#define autofs_ptype_expire_multi 2 /* Expire entry (umount request) */ +/* Daemon notification packet types */ +enum autofs_notify { + NFY_NONE, + NFY_MOUNT, + NFY_EXPIRE +}; + +/* Kernel protocol version 4 packet types */ + +/* Expire entry (umount request) */ +#define autofs_ptype_expire_multi 2 + +/* Kernel protocol version 5 packet types */ + +/* Indirect mount missing and expire requests. */ +#define autofs_ptype_missing_indirect 3 +#define autofs_ptype_expire_indirect 4 + +/* Direct mount missing and expire requests */ +#define autofs_ptype_missing_direct 5 +#define autofs_ptype_expire_direct 6 /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { @@ -40,14 +59,36 @@ struct autofs_packet_expire_multi { char name[NAME_MAX+1]; }; +/* autofs v5 common packet struct */ +struct autofs_v5_packet { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; +}; + +typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; +typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; +typedef struct autofs_v5_packet autofs_packet_missing_direct_t; +typedef struct autofs_v5_packet autofs_packet_expire_direct_t; + union autofs_packet_union { struct autofs_packet_hdr hdr; struct autofs_packet_missing missing; struct autofs_packet_expire expire; struct autofs_packet_expire_multi expire_multi; + struct autofs_v5_packet v5_packet; }; #define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) +#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) #define AUTOFS_IOC_ASKREGHOST _IOR(0x93,0x68,int) #define AUTOFS_IOC_TOGGLEREGHOST _IOR(0x93,0x69,int) From 44d53eb041d901620b1090590a549a705767fd10 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:56 -0800 Subject: [PATCH 062/306] [PATCH] autofs4: change AUTOFS_TYP_* AUTOFS_TYPE_* Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 6 +++--- fs/autofs4/expire.c | 2 +- fs/autofs4/inode.c | 12 ++++++------ fs/autofs4/waitq.c | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 37c8d909d1e9..ff6239d57b4b 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -91,9 +91,9 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d -#define AUTOFS_TYP_INDIRECT 0x0001 -#define AUTOFS_TYP_DIRECT 0x0002 -#define AUTOFS_TYP_OFFSET 0x0004 +#define AUTOFS_TYPE_INDIRECT 0x0001 +#define AUTOFS_TYPE_DIRECT 0x0002 +#define AUTOFS_TYPE_OFFSET 0x0004 struct autofs_sb_info { u32 magic; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 8fd92eaf936d..08e33218a64a 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -424,7 +424,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYP_DIRECT) + if (sbi->type & AUTOFS_TYPE_DIRECT) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 3801bed94e45..943888905493 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -178,9 +178,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); - if (sbi->type & AUTOFS_TYP_OFFSET) + if (sbi->type & AUTOFS_TYPE_OFFSET) seq_printf(m, ",offset"); - else if (sbi->type & AUTOFS_TYP_DIRECT) + else if (sbi->type & AUTOFS_TYPE_DIRECT) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); @@ -267,13 +267,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *maxproto = option; break; case Opt_indirect: - *type = AUTOFS_TYP_INDIRECT; + *type = AUTOFS_TYPE_INDIRECT; break; case Opt_direct: - *type = AUTOFS_TYP_DIRECT; + *type = AUTOFS_TYPE_DIRECT; break; case Opt_offset: - *type = AUTOFS_TYP_DIRECT | AUTOFS_TYP_OFFSET; + *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET; break; default: return 1; @@ -364,7 +364,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYP_DIRECT ? + root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 12da2c977b0a..894d74671bd0 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -207,7 +207,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYP_DIRECT)) + if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) len = sprintf(name, "%p", dentry); else { len = autofs4_getpath(sbi, dentry, &name); @@ -283,11 +283,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYP_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_DIRECT) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYP_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_DIRECT) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } From 3370c74b2e147169d5bba6665e03d3281f5795ed Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 27 Mar 2006 01:14:57 -0800 Subject: [PATCH 063/306] [PATCH] Remove redundant check from autofs4_put_super We have to have a valid sbi here, or we'd have oopsed already. (There's a dereference of sbi->catatonic a few lines above) Coverity #740 Signed-off-by: Dave Jones Cc: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 943888905493..4eddee4e76fc 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -157,8 +157,7 @@ static void autofs4_put_super(struct super_block *sb) autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ /* Clean up and release dangling references */ - if (sbi) - autofs4_force_release(sbi); + autofs4_force_release(sbi); kfree(sbi); From 871f94344cea36b2ce91231f442f9f9298529712 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:58 -0800 Subject: [PATCH 064/306] [PATCH] autofs4: follow_link missing functionality This functionality is also need for operation of autofs v5 direct mounts. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 4 ---- fs/autofs4/root.c | 50 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 08e33218a64a..b8ce02607d66 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -113,10 +113,6 @@ static int autofs4_direct_busy(struct vfsmount *mnt, DPRINTK("top %p %.*s", top, (int) top->d_name.len, top->d_name.name); - /* Not a mountpoint - give up */ - if (!d_mountpoint(top)) - return 1; - /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino = autofs4_dentry_ino(top); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 3f0048582248..c8fe43a475e2 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -57,6 +57,9 @@ struct inode_operations autofs4_indirect_root_inode_operations = { struct inode_operations autofs4_direct_root_inode_operations = { .lookup = autofs4_lookup, + .unlink = autofs4_dir_unlink, + .mkdir = autofs4_dir_mkdir, + .rmdir = autofs4_dir_rmdir, .follow_link = autofs4_follow_link, }; @@ -337,15 +340,50 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (oz_mode || !lookup_type) goto done; - status = try_to_fill_dentry(dentry, 0); - if (status) - goto out_error; + /* + * If a request is pending wait for it. + * If it's a mount then it won't be expired till at least + * a liitle later and if it's an expire then we might need + * to mount it again. + */ + if (autofs4_ispending(dentry)) { + DPRINTK("waiting for active request %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); - if (!autofs4_follow_mount(&nd->mnt, &nd->dentry)) { - status = -ENOENT; - goto out_error; + status = autofs4_wait(sbi, dentry, NFY_NONE); + + DPRINTK("request done status=%d", status); } + /* + * If the dentry contains directories then it is an + * autofs multi-mount with no root mount offset. So + * don't try to mount it again. + */ + spin_lock(&dcache_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + + status = try_to_fill_dentry(dentry, 0); + if (status) + goto out_error; + + /* + * The mount succeeded but if there is no root mount + * it must be an autofs multi-mount with no root offset + * so we don't need to follow the mount. + */ + if (d_mountpoint(dentry)) { + if (!autofs4_follow_mount(&nd->mnt, &nd->dentry)) { + status = -ENOENT; + goto out_error; + } + } + + goto done; + } + spin_unlock(&dcache_lock); + done: return NULL; From 3e7b19198003fc25b11838e709f17d4fa173b2d7 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:59 -0800 Subject: [PATCH 065/306] [PATCH] autofs4: atomic var underflow Fix accidental underflow of the atomic counter. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/waitq.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index ff6239d57b4b..617fd7b37447 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -85,7 +85,7 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t notified; + atomic_t notify; atomic_t wait_ctr; }; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 894d74671bd0..142ab6aa2aa1 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -263,7 +263,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ atomic_set(&wq->wait_ctr, 2); - atomic_set(&wq->notified, 1); + atomic_set(&wq->notify, 1); mutex_unlock(&sbi->wq_mutex); } else { atomic_inc(&wq->wait_ctr); @@ -273,9 +273,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); } - if (notify != NFY_NONE && atomic_dec_and_test(&wq->notified)) { + if (notify != NFY_NONE && atomic_read(&wq->notify)) { int type; + atomic_dec(&wq->notify); + if (sbi->version < 5) { if (notify == NFY_MOUNT) type = autofs_ptype_missing; From efc36aa5608f5717338747e152c23f2cfdb14697 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:14:59 -0800 Subject: [PATCH 066/306] [PATCH] knfsd: Change the store of auth_domains to not be a 'cache' The 'auth_domain's are simply handles on internal data structures. They do not cache information from user-space, and forcing them into the mold of a 'cache' misrepresents their true nature and causes confusion. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 5 +- include/linux/sunrpc/svcauth.h | 12 +-- net/sunrpc/auth_gss/svcauth_gss.c | 14 ++-- net/sunrpc/sunrpc_syms.c | 4 +- net/sunrpc/svcauth.c | 120 ++++++++---------------------- net/sunrpc/svcauth_unix.c | 65 ++++++++-------- 6 files changed, 78 insertions(+), 142 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 417ec02df44f..ac0997731fce 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -242,7 +242,7 @@ static inline int svc_expkey_match (struct svc_expkey *a, struct svc_expkey *b) static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *item) { - cache_get(&item->ek_client->h); + kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; new->ek_fsid[0] = item->ek_fsid[0]; @@ -474,7 +474,7 @@ static inline int svc_export_match(struct svc_export *a, struct svc_export *b) } static inline void svc_export_init(struct svc_export *new, struct svc_export *item) { - cache_get(&item->ex_client->h); + kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_dentry = dget(item->ex_dentry); new->ex_mnt = mntget(item->ex_mnt); @@ -1129,7 +1129,6 @@ exp_delclient(struct nfsctl_client *ncp) */ if (dom) { err = auth_unix_forget_old(dom); - dom->h.expiry_time = get_seconds(); auth_domain_put(dom); } diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index c119ce7cbd22..2fe2087edd66 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -45,9 +45,10 @@ struct svc_rqst; /* forward decl */ * of ip addresses to the given client. */ struct auth_domain { - struct cache_head h; + struct kref ref; + struct hlist_node hash; char *name; - int flavour; + struct auth_ops *flavour; }; /* @@ -86,6 +87,9 @@ struct auth_domain { * * domain_release() * This call releases a domain. + * set_client() + * Givens a pending request (struct svc_rqst), finds and assigns + * an appropriate 'auth_domain' as the client. */ struct auth_ops { char * name; @@ -117,7 +121,7 @@ extern void svc_auth_unregister(rpc_authflavor_t flavor); extern struct auth_domain *unix_domain_find(char *name); extern void auth_domain_put(struct auth_domain *item); extern int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom); -extern struct auth_domain *auth_domain_lookup(struct auth_domain *item, int set); +extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new); extern struct auth_domain *auth_domain_find(char *name); extern struct auth_domain *auth_unix_lookup(struct in_addr addr); extern int auth_unix_forget_old(struct auth_domain *dom); @@ -160,8 +164,6 @@ static inline unsigned long hash_mem(char *buf, int length, int bits) return hash >> (BITS_PER_LONG - bits); } -extern struct cache_detail auth_domain_cache, ip_map_cache; - #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_H_ */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 23632d84d8d7..6b073c2e6930 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -645,6 +645,8 @@ find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) return auth_domain_find(name); } +static struct auth_ops svcauthops_gss; + int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { @@ -655,20 +657,18 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; - cache_init(&new->h.h); + kref_init(&new->h.ref); new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL); if (!new->h.name) goto out_free_dom; strcpy(new->h.name, name); - new->h.flavour = RPC_AUTH_GSS; + new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; - new->h.h.expiry_time = NEVER; - test = auth_domain_lookup(&new->h, 1); - if (test == &new->h) { - BUG_ON(atomic_dec_and_test(&new->h.h.refcnt)); - } else { /* XXX Duplicate registration? */ + test = auth_domain_lookup(name, &new->h); + if (test != &new->h) { /* XXX Duplicate registration? */ auth_domain_put(&new->h); + /* dangling ref-count... */ goto out; } return 0; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 9f7373203592..40401196e7de 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -142,6 +142,7 @@ EXPORT_SYMBOL(nlm_debug); extern int register_rpc_pipefs(void); extern void unregister_rpc_pipefs(void); +extern struct cache_detail ip_map_cache; static int __init init_sunrpc(void) @@ -158,7 +159,6 @@ init_sunrpc(void) #ifdef CONFIG_PROC_FS rpc_proc_init(); #endif - cache_register(&auth_domain_cache); cache_register(&ip_map_cache); out: return err; @@ -169,8 +169,6 @@ cleanup_sunrpc(void) { unregister_rpc_pipefs(); rpc_destroy_mempool(); - if (cache_unregister(&auth_domain_cache)) - printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n"); if (cache_unregister(&ip_map_cache)) printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); #ifdef RPC_DEBUG diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index dda4f0c63511..5b28c6176806 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -106,112 +106,56 @@ svc_auth_unregister(rpc_authflavor_t flavor) EXPORT_SYMBOL(svc_auth_unregister); /************************************************** - * cache for domain name to auth_domain - * Entries are only added by flavours which will normally - * have a structure that 'inherits' from auth_domain. - * e.g. when an IP -> domainname is given to auth_unix, - * and the domain name doesn't exist, it will create a - * auth_unix_domain and add it to this hash table. - * If it finds the name does exist, but isn't AUTH_UNIX, - * it will complain. + * 'auth_domains' are stored in a hash table indexed by name. + * When the last reference to an 'auth_domain' is dropped, + * the object is unhashed and freed. + * If auth_domain_lookup fails to find an entry, it will return + * it's second argument 'new'. If this is non-null, it will + * have been atomically linked into the table. */ -/* - * Auth auth_domain cache is somewhat different to other caches, - * largely because the entries are possibly of different types: - * each auth flavour has it's own type. - * One consequence of this that DefineCacheLookup cannot - * allocate a new structure as it cannot know the size. - * Notice that the "INIT" code fragment is quite different - * from other caches. When auth_domain_lookup might be - * creating a new domain, the new domain is passed in - * complete and it is used as-is rather than being copied into - * another structure. - */ #define DN_HASHBITS 6 #define DN_HASHMAX (1<flavour]->domain_release(dom); -} - - -struct cache_detail auth_domain_cache = { - .owner = THIS_MODULE, - .hash_size = DN_HASHMAX, - .hash_table = auth_domain_table, - .name = "auth.domain", - .cache_put = auth_domain_drop, -}; +static struct hlist_head auth_domain_table[DN_HASHMAX]; +static spinlock_t auth_domain_lock = SPIN_LOCK_UNLOCKED; void auth_domain_put(struct auth_domain *dom) { - auth_domain_drop(&dom->h, &auth_domain_cache); -} - -static inline int auth_domain_hash(struct auth_domain *item) -{ - return hash_str(item->name, DN_HASHBITS); -} -static inline int auth_domain_match(struct auth_domain *tmp, struct auth_domain *item) -{ - return strcmp(tmp->name, item->name) == 0; + if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + } } struct auth_domain * -auth_domain_lookup(struct auth_domain *item, int set) +auth_domain_lookup(char *name, struct auth_domain *new) { - struct auth_domain *tmp = NULL; - struct cache_head **hp, **head; - head = &auth_domain_cache.hash_table[auth_domain_hash(item)]; + struct auth_domain *hp; + struct hlist_head *head; + struct hlist_node *np; - if (set) - write_lock(&auth_domain_cache.hash_lock); - else - read_lock(&auth_domain_cache.hash_lock); - for (hp=head; *hp != NULL; hp = &tmp->h.next) { - tmp = container_of(*hp, struct auth_domain, h); - if (!auth_domain_match(tmp, item)) - continue; - if (!set) { - cache_get(&tmp->h); - goto out_noset; + head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; + + spin_lock(&auth_domain_lock); + + hlist_for_each_entry(hp, np, head, hash) { + if (strcmp(hp->name, name)==0) { + kref_get(&hp->ref); + spin_unlock(&auth_domain_lock); + return hp; } - *hp = tmp->h.next; - tmp->h.next = NULL; - auth_domain_drop(&tmp->h, &auth_domain_cache); - goto out_set; } - /* Didn't find anything */ - if (!set) - goto out_nada; - auth_domain_cache.entries++; -out_set: - item->h.next = *head; - *head = &item->h; - cache_get(&item->h); - write_unlock(&auth_domain_cache.hash_lock); - cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time); - cache_get(&item->h); - return item; -out_nada: - tmp = NULL; -out_noset: - read_unlock(&auth_domain_cache.hash_lock); - return tmp; + if (new) { + hlist_add_head(&new->hash, head); + kref_get(&new->ref); + } + spin_unlock(&auth_domain_lock); + return new; } struct auth_domain *auth_domain_find(char *name) { - struct auth_domain *rv, ad; - - ad.name = name; - rv = auth_domain_lookup(&ad, 0); - return rv; + return auth_domain_lookup(name, NULL); } diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 3e6c694bbad1..17e8b2a3130c 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -27,41 +27,35 @@ struct unix_domain { /* other stuff later */ }; +extern struct auth_ops svcauth_unix; + struct auth_domain *unix_domain_find(char *name) { - struct auth_domain *rv, ud; - struct unix_domain *new; + struct auth_domain *rv; + struct unix_domain *new = NULL; - ud.name = name; - - rv = auth_domain_lookup(&ud, 0); + rv = auth_domain_lookup(name, NULL); + while(1) { + if (rv != &new->h) { + if (new) auth_domain_put(&new->h); + return rv; + } + if (rv && rv->flavour != &svcauth_unix) { + auth_domain_put(rv); + return NULL; + } + if (rv) + return rv; - foundit: - if (rv && rv->flavour != RPC_AUTH_UNIX) { - auth_domain_put(rv); - return NULL; + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + kref_init(&new->h.ref); + new->h.name = kstrdup(name, GFP_KERNEL); + new->h.flavour = &svcauth_unix; + new->addr_changes = 0; + rv = auth_domain_lookup(name, &new->h); } - if (rv) - return rv; - - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (new == NULL) - return NULL; - cache_init(&new->h.h); - new->h.name = kstrdup(name, GFP_KERNEL); - new->h.flavour = RPC_AUTH_UNIX; - new->addr_changes = 0; - new->h.h.expiry_time = NEVER; - - rv = auth_domain_lookup(&new->h, 2); - if (rv == &new->h) { - if (atomic_dec_and_test(&new->h.h.refcnt)) BUG(); - } else { - auth_domain_put(&new->h); - goto foundit; - } - - return rv; } static void svcauth_unix_domain_release(struct auth_domain *dom) @@ -130,7 +124,7 @@ static inline void ip_map_init(struct ip_map *new, struct ip_map *item) } static inline void ip_map_update(struct ip_map *new, struct ip_map *item) { - cache_get(&item->m_client->h.h); + kref_get(&item->m_client->h.ref); new->m_client = item->m_client; new->m_add_change = item->m_add_change; } @@ -272,7 +266,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) struct unix_domain *udom; struct ip_map ip, *ipmp; - if (dom->flavour != RPC_AUTH_UNIX) + if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); strcpy(ip.m_class, "nfsd"); @@ -295,7 +289,7 @@ int auth_unix_forget_old(struct auth_domain *dom) { struct unix_domain *udom; - if (dom->flavour != RPC_AUTH_UNIX) + if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); udom->addr_changes++; @@ -323,7 +317,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) rv = NULL; } else { rv = &ipm->m_client->h; - cache_get(&rv->h); + kref_get(&rv->ref); } ip_map_put(&ipm->h, &ip_map_cache); return rv; @@ -332,7 +326,6 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) void svcauth_unix_purge(void) { cache_purge(&ip_map_cache); - cache_purge(&auth_domain_cache); } static int @@ -361,7 +354,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) return SVC_DENIED; case 0: rqstp->rq_client = &ipm->m_client->h; - cache_get(&rqstp->rq_client->h); + kref_get(&rqstp->rq_client->ref); ip_map_put(&ipm->h, &ip_map_cache); break; } From eab7e2e647c348b418e8715ecaca0177e1b473c7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:00 -0800 Subject: [PATCH 067/306] [PATCH] knfsd: Break the hard linkage from svc_expkey to svc_export Current svc_expkey holds a pointer to the svc_export structure, so updates to that structure have to be in-place, which is a wart on the whole cache infrastruct. So we break that linkage and just do a second lookup. If this became a performance issue, it would be possible to put a direct link back in which was only used conditionally. i.e. when an object is replaced in the cache, we set a flag in the old object. When dereferencing the link from svc_expkey, if the flag is set, we drop the reference and do a fresh lookup. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 60 ++++++++++++++++++++++++------------- include/linux/nfsd/export.h | 20 +++---------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ac0997731fce..587829ed651c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -73,8 +73,10 @@ void expkey_put(struct cache_head *item, struct cache_detail *cd) if (cache_put(item, cd)) { struct svc_expkey *key = container_of(item, struct svc_expkey, h); if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) - exp_put(key->ek_export); + !test_bit(CACHE_NEGATIVE, &item->flags)) { + dput(key->ek_dentry); + mntput(key->ek_mnt); + } auth_domain_put(key->ek_client); kfree(key); } @@ -164,26 +166,18 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) } else { struct nameidata nd; struct svc_expkey *ek; - struct svc_export *exp; err = path_lookup(buf, 0, &nd); if (err) goto out; dprintk("Found the path %s\n", buf); - exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL); - - err = -ENOENT; - if (!exp) - goto out_nd; - key.ek_export = exp; - dprintk("And found export\n"); + key.ek_mnt = nd.mnt; + key.ek_dentry = nd.dentry; ek = svc_expkey_lookup(&key, 1); if (ek) expkey_put(&ek->h, &svc_expkey_cache); - exp_put(exp); err = 0; - out_nd: path_release(&nd); } cache_flush(); @@ -214,7 +208,7 @@ static int expkey_show(struct seq_file *m, if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); - seq_path(m, ek->ek_export->ex_mnt, ek->ek_export->ex_dentry, "\\ \t\n"); + seq_path(m, ek->ek_mnt, ek->ek_dentry, "\\ \t\n"); } seq_printf(m, "\n"); return 0; @@ -252,8 +246,8 @@ static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *it static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey *item) { - cache_get(&item->ek_export->h); - new->ek_export = item->ek_export; + new->ek_mnt = mntget(item->ek_mnt); + new->ek_dentry = dget(item->ek_dentry); } static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ @@ -519,7 +513,8 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); - key.ek_export = exp; + key.ek_mnt = exp->ex_mnt; + key.ek_dentry = exp->ex_dentry; key.h.expiry_time = NEVER; key.h.flags = 0; @@ -741,8 +736,8 @@ exp_export(struct nfsctl_export *nxp) if ((nxp->ex_flags & NFSEXP_FSID) && (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) && !IS_ERR(fsid_key) && - fsid_key->ek_export && - fsid_key->ek_export != exp) + fsid_key->ek_mnt && + (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) ) goto finish; if (exp) { @@ -912,6 +907,24 @@ out: return err; } +struct svc_export * +exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, + struct cache_req *reqp) +{ + struct svc_export *exp; + struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); + if (!ek || IS_ERR(ek)) + return ERR_PTR(PTR_ERR(ek)); + + exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); + expkey_put(&ek->h, &svc_expkey_cache); + + if (!exp || IS_ERR(exp)) + return ERR_PTR(PTR_ERR(exp)); + return exp; +} + + /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the @@ -922,6 +935,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, struct cache_req *creq) { struct svc_expkey *fsid_key; + struct svc_export *exp; int rv; u32 fsidv[2]; @@ -933,8 +947,14 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, if (!fsid_key || IS_ERR(fsid_key)) return nfserr_perm; - rv = fh_compose(fhp, fsid_key->ek_export, - fsid_key->ek_export->ex_dentry, NULL); + exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq); + if (exp == NULL) + rv = nfserr_perm; + else if (IS_ERR(exp)) + rv = nfserrno(PTR_ERR(exp)); + else + rv = fh_compose(fhp, exp, + fsid_key->ek_dentry, NULL); expkey_put(&fsid_key->h, &svc_expkey_cache); return rv; } diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 6bad4766d3d9..d52e0b7ad37b 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -67,7 +67,8 @@ struct svc_expkey { int ek_fsidtype; u32 ek_fsid[3]; - struct svc_export * ek_export; + struct vfsmount * ek_mnt; + struct dentry * ek_dentry; }; #define EX_SECURE(exp) (!((exp)->ex_flags & NFSEXP_INSECURE_PORT)) @@ -114,22 +115,9 @@ static inline void exp_get(struct svc_export *exp) { cache_get(&exp->h); } -static inline struct svc_export * +extern struct svc_export * exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, - struct cache_req *reqp) -{ - struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); - if (ek && !IS_ERR(ek)) { - struct svc_export *exp = ek->ek_export; - int err; - exp_get(exp); - expkey_put(&ek->h, &svc_expkey_cache); - if ((err = cache_check(&svc_export_cache, &exp->h, reqp))) - exp = ERR_PTR(err); - return exp; - } else - return ERR_PTR(PTR_ERR(ek)); -} + struct cache_req *reqp); #endif /* __KERNEL__ */ From 7d317f2c9f1e9dcf4f632fa98f91d1d4a36c4cae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:01 -0800 Subject: [PATCH 068/306] [PATCH] knfsd: Get rid of 'inplace' sunrpc caches These were an unnecessary wart. Also only have one 'DefineSimpleCache..' instead of two. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 4 ++-- fs/nfsd/nfs4idmap.c | 10 ++-------- include/linux/sunrpc/cache.h | 28 +++++++++++----------------- net/sunrpc/auth_gss/svcauth_gss.c | 4 ++-- net/sunrpc/svcauth_unix.c | 2 +- 5 files changed, 18 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 587829ed651c..c591761a1ad6 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -250,7 +250,7 @@ static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey * new->ek_dentry = dget(item->ek_dentry); } -static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ +static DefineSimpleCacheLookup(svc_expkey, svc_expkey) #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) @@ -482,7 +482,7 @@ static inline void svc_export_update(struct svc_export *new, struct svc_export * new->ex_fsid = item->ex_fsid; } -static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */ +static DefineSimpleCacheLookup(svc_export, svc_export) struct svc_expkey * diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 13369650cdf9..dea690aa8bb5 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -76,12 +76,6 @@ struct ent { char authname[IDMAP_NAMESZ]; }; -#define DefineSimpleCacheLookupMap(STRUCT, FUNC) \ - DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ - (struct STRUCT *item, int set), /*no setup */, \ - & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ - STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0) - /* Common entry handling */ #define ENT_HASHBITS 8 @@ -264,7 +258,7 @@ out: return error; } -static DefineSimpleCacheLookupMap(ent, idtoname); +static DefineSimpleCacheLookup(ent, idtoname); /* * Name -> ID cache @@ -390,7 +384,7 @@ out: return (error); } -static DefineSimpleCacheLookupMap(ent, nametoid); +static DefineSimpleCacheLookup(ent, nametoid); /* * Exported API diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index c4e3ea7cf154..405ac14e509a 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -133,14 +133,11 @@ struct cache_deferred_req { * If "set" == 0 : * If an entry is found, it is returned * If no entry is found, a new non-VALID entry is created. - * If "set" == 1 and INPLACE == 0 : + * If "set" == 1 : * If no entry is found a new one is inserted with data from "template" * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE * If a CACHE_VALID entry is found, a new entry is swapped in with data * from "template" - * If set == 1, and INPLACE == 1 : - * As above, except that if a CACHE_VALID entry is found, we UPDATE in place - * instead of swapping in a new entry. * * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not * run but insteead CACHE_NEGATIVE is set in any new item. @@ -159,13 +156,8 @@ struct cache_deferred_req { * TEST tests if "tmp" matches "item" * INIT copies key information from "item" to "new" * UPDATE copies content information from "item" to "tmp" - * INPLACE is true if updates can happen inplace rather than allocating a new structure - * - * WARNING: any substantial changes to this must be reflected in - * net/sunrpc/svcauth.c(auth_domain_lookup) - * which is a similar routine that is open-coded. */ -#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \ +#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE) \ RTN *FNAME ARGS \ { \ RTN *tmp, *new=NULL; \ @@ -179,13 +171,13 @@ RTN *FNAME ARGS \ tmp = container_of(*hp, RTN, MEMBER); \ if (TEST) { /* found a match */ \ \ - if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ + if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ break; \ \ if (new) \ {INIT;} \ if (set) { \ - if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ + if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ { /* need to swap in new */ \ RTN *t2; \ \ @@ -206,7 +198,7 @@ RTN *FNAME ARGS \ else read_unlock(&(DETAIL)->hash_lock); \ if (set) \ cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ - if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ + if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ return tmp; \ } \ @@ -239,10 +231,12 @@ RTN *FNAME ARGS \ return NULL; \ } -#define DefineSimpleCacheLookup(STRUCT,INPLACE) \ - DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \ - & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\ - STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE) +#define DefineSimpleCacheLookup(STRUCT, FUNC) \ + DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ + (struct STRUCT *item, int set), /*no setup */, \ + & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ + STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + #define cache_for_each(pos, detail, index, member) \ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 6b073c2e6930..aadb4e8d6aa7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -259,7 +259,7 @@ static struct cache_detail rsi_cache = { .cache_parse = rsi_parse, }; -static DefineSimpleCacheLookup(rsi, 0) +static DefineSimpleCacheLookup(rsi, rsi) /* * The rpcsec_context cache is used to store a context that is @@ -446,7 +446,7 @@ static struct cache_detail rsc_cache = { .cache_parse = rsc_parse, }; -static DefineSimpleCacheLookup(rsc, 0); +static DefineSimpleCacheLookup(rsc, rsc); static struct rsc * gss_svc_searchbyctx(struct xdr_netobj *handle) diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 17e8b2a3130c..7ddf068b5b25 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -258,7 +258,7 @@ struct cache_detail ip_map_cache = { .cache_show = ip_map_show, }; -static DefineSimpleCacheLookup(ip_map, 0) +static DefineSimpleCacheLookup(ip_map, ip_map) int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) From 15a5f6bd23eddd5b3be80366f364be04fb1c1c99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:02 -0800 Subject: [PATCH 069/306] [PATCH] knfsd: Create cache_lookup function instead of using a macro to declare one The C++-like 'template' approach proves to be too ugly and hard to work with. The old 'template' won't go away until all users are updated. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 12 +++++ net/sunrpc/cache.c | 98 ++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 405ac14e509a..3e17a5ff1dea 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -81,6 +81,11 @@ struct cache_detail { struct cache_detail *cd, struct cache_head *h); + struct cache_head * (*alloc)(void); + int (*match)(struct cache_head *orig, struct cache_head *new); + void (*init)(struct cache_head *orig, struct cache_head *new); + void (*update)(struct cache_head *orig, struct cache_head *new); + /* fields below this comment are for internal use * and should not be touched by cache owners */ @@ -237,6 +242,13 @@ RTN *FNAME ARGS \ & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) +extern struct cache_head * +sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash); +extern struct cache_head * +sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash); + #define cache_for_each(pos, detail, index, member) \ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 0acccfeeb284..4449dc52edf5 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -47,6 +47,104 @@ void cache_init(struct cache_head *h) h->last_refresh = now; } +struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) +{ + struct cache_head **head, **hp; + struct cache_head *new = NULL; + + head = &detail->hash_table[hash]; + + read_lock(&detail->hash_lock); + + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; + } + } + read_unlock(&detail->hash_lock); + /* Didn't find anything, insert an empty entry */ + + new = detail->alloc(); + if (!new) + return NULL; + cache_init(new); + + write_lock(&detail->hash_lock); + + /* check if entry appeared while we slept */ + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + cache_get(tmp); + write_unlock(&detail->hash_lock); + detail->cache_put(new, detail); + return tmp; + } + } + detail->init(new, key); + new->next = *head; + *head = new; + detail->entries++; + cache_get(new); + write_unlock(&detail->hash_lock); + + return new; +} +EXPORT_SYMBOL(sunrpc_cache_lookup); + +struct cache_head *sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash) +{ + /* The 'old' entry is to be replaced by 'new'. + * If 'old' is not VALID, we update it directly, + * otherwise we need to replace it + */ + struct cache_head **head; + struct cache_head *tmp; + + if (!test_bit(CACHE_VALID, &old->flags)) { + write_lock(&detail->hash_lock); + if (!test_bit(CACHE_VALID, &old->flags)) { + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &old->flags); + else + detail->update(old, new); + /* FIXME cache_fresh should come first */ + write_unlock(&detail->hash_lock); + cache_fresh(detail, old, new->expiry_time); + return old; + } + write_unlock(&detail->hash_lock); + } + /* We need to insert a new entry */ + tmp = detail->alloc(); + if (!tmp) { + detail->cache_put(old, detail); + return NULL; + } + cache_init(tmp); + detail->init(tmp, old); + head = &detail->hash_table[hash]; + + write_lock(&detail->hash_lock); + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &tmp->flags); + else + detail->update(tmp, new); + tmp->next = *head; + *head = tmp; + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_fresh(detail, tmp, new->expiry_time); + cache_fresh(detail, old, 0); + detail->cache_put(old, detail); + return tmp; +} +EXPORT_SYMBOL(sunrpc_cache_update); static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h); /* From 1a9917c2da5b81fa536e560be0c431435e2c965b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:02 -0800 Subject: [PATCH 070/306] [PATCH] knfsd: Convert ip_map cache to use the new lookup routine Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svcauth_unix.c | 148 +++++++++++++++++++++++++------------- 1 file changed, 98 insertions(+), 50 deletions(-) diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 7ddf068b5b25..7e38621a20b7 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -106,28 +106,38 @@ static inline int hash_ip(unsigned long ip) return (hash ^ (hash>>8)) & 0xff; } #endif +static int ip_map_match(struct cache_head *corig, struct cache_head *cnew) +{ + struct ip_map *orig = container_of(corig, struct ip_map, h); + struct ip_map *new = container_of(cnew, struct ip_map, h); + return strcmp(orig->m_class, new->m_class) == 0 + && orig->m_addr.s_addr == new->m_addr.s_addr; +} +static void ip_map_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct ip_map *new = container_of(cnew, struct ip_map, h); + struct ip_map *item = container_of(citem, struct ip_map, h); -static inline int ip_map_hash(struct ip_map *item) -{ - return hash_str(item->m_class, IP_HASHBITS) ^ - hash_ip((unsigned long)item->m_addr.s_addr); -} -static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp) -{ - return strcmp(tmp->m_class, item->m_class) == 0 - && tmp->m_addr.s_addr == item->m_addr.s_addr; -} -static inline void ip_map_init(struct ip_map *new, struct ip_map *item) -{ strcpy(new->m_class, item->m_class); new->m_addr.s_addr = item->m_addr.s_addr; } -static inline void ip_map_update(struct ip_map *new, struct ip_map *item) +static void update(struct cache_head *cnew, struct cache_head *citem) { + struct ip_map *new = container_of(cnew, struct ip_map, h); + struct ip_map *item = container_of(citem, struct ip_map, h); + kref_get(&item->m_client->h.ref); new->m_client = item->m_client; new->m_add_change = item->m_add_change; } +static struct cache_head *ip_map_alloc(void) +{ + struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} static void ip_map_request(struct cache_detail *cd, struct cache_head *h, @@ -148,7 +158,8 @@ static void ip_map_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static struct ip_map *ip_map_lookup(struct ip_map *, int); +static struct ip_map *ip_map_lookup(char *class, struct in_addr addr); +static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry); static int ip_map_parse(struct cache_detail *cd, char *mesg, int mlen) @@ -160,7 +171,11 @@ static int ip_map_parse(struct cache_detail *cd, int len; int b1,b2,b3,b4; char c; - struct ip_map ipm, *ipmp; + char class[8]; + struct in_addr addr; + int err; + + struct ip_map *ipmp; struct auth_domain *dom; time_t expiry; @@ -169,7 +184,7 @@ static int ip_map_parse(struct cache_detail *cd, mesg[mlen-1] = 0; /* class */ - len = qword_get(&mesg, ipm.m_class, sizeof(ipm.m_class)); + len = qword_get(&mesg, class, sizeof(class)); if (len <= 0) return -EINVAL; /* ip address */ @@ -194,25 +209,22 @@ static int ip_map_parse(struct cache_detail *cd, } else dom = NULL; - ipm.m_addr.s_addr = + addr.s_addr = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); - ipm.h.flags = 0; - if (dom) { - ipm.m_client = container_of(dom, struct unix_domain, h); - ipm.m_add_change = ipm.m_client->addr_changes; - } else - set_bit(CACHE_NEGATIVE, &ipm.h.flags); - ipm.h.expiry_time = expiry; - ipmp = ip_map_lookup(&ipm, 1); - if (ipmp) - ip_map_put(&ipmp->h, &ip_map_cache); + ipmp = ip_map_lookup(class,addr); + if (ipmp) { + err = ip_map_update(ipmp, + container_of(dom, struct unix_domain, h), + expiry); + } else + err = -ENOMEM; + if (dom) auth_domain_put(dom); - if (!ipmp) - return -ENOMEM; + cache_flush(); - return 0; + return err; } static int ip_map_show(struct seq_file *m, @@ -256,32 +268,70 @@ struct cache_detail ip_map_cache = { .cache_request = ip_map_request, .cache_parse = ip_map_parse, .cache_show = ip_map_show, + .match = ip_map_match, + .init = ip_map_init, + .update = update, + .alloc = ip_map_alloc, }; -static DefineSimpleCacheLookup(ip_map, ip_map) +static struct ip_map *ip_map_lookup(char *class, struct in_addr addr) +{ + struct ip_map ip; + struct cache_head *ch; + strcpy(ip.m_class, class); + ip.m_addr = addr; + ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h, + hash_str(class, IP_HASHBITS) ^ + hash_ip((unsigned long)addr.s_addr)); + + if (ch) + return container_of(ch, struct ip_map, h); + else + return NULL; +} + +static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry) +{ + struct ip_map ip; + struct cache_head *ch; + + ip.m_client = udom; + ip.h.flags = 0; + if (!udom) + set_bit(CACHE_NEGATIVE, &ip.h.flags); + else { + ip.m_add_change = udom->addr_changes; + /* if this is from the legacy set_client system call, + * we need m_add_change to be one higher + */ + if (expiry == NEVER) + ip.m_add_change++; + } + ip.h.expiry_time = expiry; + ch = sunrpc_cache_update(&ip_map_cache, + &ip.h, &ipm->h, + hash_str(ipm->m_class, IP_HASHBITS) ^ + hash_ip((unsigned long)ipm->m_addr.s_addr)); + if (!ch) + return -ENOMEM; + ip_map_put(ch, &ip_map_cache); + return 0; +} int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) { struct unix_domain *udom; - struct ip_map ip, *ipmp; + struct ip_map *ipmp; if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); - strcpy(ip.m_class, "nfsd"); - ip.m_addr = addr; - ip.m_client = udom; - ip.m_add_change = udom->addr_changes+1; - ip.h.flags = 0; - ip.h.expiry_time = NEVER; - - ipmp = ip_map_lookup(&ip, 1); + ipmp = ip_map_lookup("nfsd", addr); - if (ipmp) { - ip_map_put(&ipmp->h, &ip_map_cache); - return 0; - } else + if (ipmp) + return ip_map_update(ipmp, udom, NEVER); + else return -ENOMEM; } @@ -304,7 +354,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) strcpy(key.m_class, "nfsd"); key.m_addr = addr; - ipm = ip_map_lookup(&key, 0); + ipm = ip_map_lookup("nfsd", addr); if (!ipm) return NULL; @@ -331,16 +381,14 @@ void svcauth_unix_purge(void) static int svcauth_unix_set_client(struct svc_rqst *rqstp) { - struct ip_map key, *ipm; + struct ip_map *ipm; rqstp->rq_client = NULL; if (rqstp->rq_proc == 0) return SVC_OK; - strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); - key.m_addr = rqstp->rq_addr.sin_addr; - - ipm = ip_map_lookup(&key, 0); + ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, + rqstp->rq_addr.sin_addr); if (ipm == NULL) return SVC_DENIED; From 4f7774c3a0558526c0faaf525581f2029480b313 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:03 -0800 Subject: [PATCH 071/306] [PATCH] knfsd: Use new cache_lookup for svc_export Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 125 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 37 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c591761a1ad6..2ebd77d758bc 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -258,16 +258,6 @@ static DefineSimpleCacheLookup(svc_expkey, svc_expkey) static struct cache_head *export_table[EXPORT_HASHMAX]; -static inline int svc_export_hash(struct svc_export *item) -{ - int rv; - - rv = hash_ptr(item->ex_client, EXPORT_HASHBITS); - rv ^= hash_ptr(item->ex_dentry, EXPORT_HASHBITS); - rv ^= hash_ptr(item->ex_mnt, EXPORT_HASHBITS); - return rv; -} - void svc_export_put(struct cache_head *item, struct cache_detail *cd) { if (cache_put(item, cd)) { @@ -298,7 +288,8 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static struct svc_export *svc_export_lookup(struct svc_export *, int); +struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); +static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct inode *inode, int flags) { @@ -411,11 +402,16 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err) goto out; } - expp = svc_export_lookup(&exp, 1); + expp = svc_export_lookup(&exp); if (expp) - exp_put(expp); - err = 0; + expp = svc_export_update(&exp, expp); + else + err = -ENOMEM; cache_flush(); + if (expp == NULL) + err = -ENOMEM; + else + exp_put(expp); out: if (nd.dentry) path_release(&nd); @@ -449,6 +445,46 @@ static int svc_export_show(struct seq_file *m, seq_puts(m, ")\n"); return 0; } +static int svc_export_match(struct cache_head *a, struct cache_head *b) +{ + struct svc_export *orig = container_of(a, struct svc_export, h); + struct svc_export *new = container_of(b, struct svc_export, h); + return orig->ex_client == new->ex_client && + orig->ex_dentry == new->ex_dentry && + orig->ex_mnt == new->ex_mnt; +} + +static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + kref_get(&item->ex_client->ref); + new->ex_client = item->ex_client; + new->ex_dentry = dget(item->ex_dentry); + new->ex_mnt = mntget(item->ex_mnt); +} + +static void export_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + new->ex_flags = item->ex_flags; + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; +} + +static struct cache_head *svc_export_alloc(void) +{ + struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + struct cache_detail svc_export_cache = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, @@ -458,31 +494,46 @@ struct cache_detail svc_export_cache = { .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, + .match = svc_export_match, + .init = svc_export_init, + .update = export_update, + .alloc = svc_export_alloc, }; -static inline int svc_export_match(struct svc_export *a, struct svc_export *b) +static struct svc_export * +svc_export_lookup(struct svc_export *exp) { - return a->ex_client == b->ex_client && - a->ex_dentry == b->ex_dentry && - a->ex_mnt == b->ex_mnt; -} -static inline void svc_export_init(struct svc_export *new, struct svc_export *item) -{ - kref_get(&item->ex_client->ref); - new->ex_client = item->ex_client; - new->ex_dentry = dget(item->ex_dentry); - new->ex_mnt = mntget(item->ex_mnt); + struct cache_head *ch; + int hash; + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_mnt, EXPORT_HASHBITS); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; } -static inline void svc_export_update(struct svc_export *new, struct svc_export *item) +struct svc_export * +svc_export_update(struct svc_export *new, struct svc_export *old) { - new->ex_flags = item->ex_flags; - new->ex_anon_uid = item->ex_anon_uid; - new->ex_anon_gid = item->ex_anon_gid; - new->ex_fsid = item->ex_fsid; -} + struct cache_head *ch; + int hash; + hash = hash_ptr(old->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(old->ex_dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(old->ex_mnt, EXPORT_HASHBITS); -static DefineSimpleCacheLookup(svc_export, svc_export) + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} struct svc_expkey * @@ -568,7 +619,7 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, key.ex_mnt = mnt; key.ex_dentry = dentry; - exp = svc_export_lookup(&key, 0); + exp = svc_export_lookup(&key); if (exp != NULL) switch (cache_check(&svc_export_cache, &exp->h, reqp)) { case 0: break; @@ -770,13 +821,13 @@ exp_export(struct nfsctl_export *nxp) new.ex_anon_gid = nxp->ex_anon_gid; new.ex_fsid = nxp->ex_dev; - exp = svc_export_lookup(&new, 1); + exp = svc_export_lookup(&new); + if (exp) + exp = svc_export_update(&new, exp); - if (exp == NULL) + if (!exp) goto finish; - err = 0; - if (exp_hash(clp, exp) || exp_fsid_hash(clp, exp)) { /* failed to create at least one index */ From 8d270f7f4cdab792d7263926ac59f747258735ee Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:04 -0800 Subject: [PATCH 072/306] [PATCH] knfsd: Use new cache_lookup for svc_expkey cache Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 142 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 40 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 2ebd77d758bc..abd68965822f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -57,17 +57,6 @@ static int exp_verify_string(char *cp, int max); #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static struct cache_head *expkey_table[EXPKEY_HASHMAX]; -static inline int svc_expkey_hash(struct svc_expkey *item) -{ - int hash = item->ek_fsidtype; - char * cp = (char*)item->ek_fsid; - int len = key_len(item->ek_fsidtype); - - hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); - hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); - return hash & EXPKEY_HASHMASK; -} - void expkey_put(struct cache_head *item, struct cache_detail *cd) { if (cache_put(item, cd)) { @@ -97,7 +86,8 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *, int); +static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); +static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid [path] */ @@ -108,6 +98,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) int fsidtype; char *ep; struct svc_expkey key; + struct svc_expkey *ek; if (mesg[mlen-1] != '\n') return -EINVAL; @@ -152,20 +143,25 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); + ek = svc_expkey_lookup(&key); + err = -ENOMEM; + if (!ek) + goto out; + /* now we want a pathname, or empty meaning NEGATIVE */ + err = -EINVAL; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { - struct svc_expkey *ek; set_bit(CACHE_NEGATIVE, &key.h.flags); - ek = svc_expkey_lookup(&key, 1); + ek = svc_expkey_update(&key, ek); if (ek) expkey_put(&ek->h, &svc_expkey_cache); + else err = -ENOMEM; } else { struct nameidata nd; - struct svc_expkey *ek; err = path_lookup(buf, 0, &nd); if (err) goto out; @@ -174,10 +170,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) key.ek_mnt = nd.mnt; key.ek_dentry = nd.dentry; - ek = svc_expkey_lookup(&key, 1); + ek = svc_expkey_update(&key, ek); if (ek) expkey_put(&ek->h, &svc_expkey_cache); - err = 0; + else + err = -ENOMEM; path_release(&nd); } cache_flush(); @@ -213,7 +210,52 @@ static int expkey_show(struct seq_file *m, seq_printf(m, "\n"); return 0; } - + +static inline int expkey_match (struct cache_head *a, struct cache_head *b) +{ + struct svc_expkey *orig = container_of(a, struct svc_expkey, h); + struct svc_expkey *new = container_of(b, struct svc_expkey, h); + + if (orig->ek_fsidtype != new->ek_fsidtype || + orig->ek_client != new->ek_client || + memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) + return 0; + return 1; +} + +static inline void expkey_init(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + kref_get(&item->ek_client->ref); + new->ek_client = item->ek_client; + new->ek_fsidtype = item->ek_fsidtype; + new->ek_fsid[0] = item->ek_fsid[0]; + new->ek_fsid[1] = item->ek_fsid[1]; + new->ek_fsid[2] = item->ek_fsid[2]; +} + +static inline void expkey_update(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + new->ek_mnt = mntget(item->ek_mnt); + new->ek_dentry = dget(item->ek_dentry); +} + +static struct cache_head *expkey_alloc(void) +{ + struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + struct cache_detail svc_expkey_cache = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, @@ -223,34 +265,52 @@ struct cache_detail svc_expkey_cache = { .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, + .match = expkey_match, + .init = expkey_init, + .update = expkey_update, + .alloc = expkey_alloc, }; -static inline int svc_expkey_match (struct svc_expkey *a, struct svc_expkey *b) +static struct svc_expkey * +svc_expkey_lookup(struct svc_expkey *item) { - if (a->ek_fsidtype != b->ek_fsidtype || - a->ek_client != b->ek_client || - memcmp(a->ek_fsid, b->ek_fsid, key_len(a->ek_fsidtype)) != 0) - return 0; - return 1; + struct cache_head *ch; + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; } -static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *item) +static struct svc_expkey * +svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) { - kref_get(&item->ek_client->ref); - new->ek_client = item->ek_client; - new->ek_fsidtype = item->ek_fsidtype; - new->ek_fsid[0] = item->ek_fsid[0]; - new->ek_fsid[1] = item->ek_fsid[1]; - new->ek_fsid[2] = item->ek_fsid[2]; + struct cache_head *ch; + int hash = new->ek_fsidtype; + char * cp = (char*)new->ek_fsid; + int len = key_len(new->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; } -static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey *item) -{ - new->ek_mnt = mntget(item->ek_mnt); - new->ek_dentry = dget(item->ek_dentry); -} - -static DefineSimpleCacheLookup(svc_expkey, svc_expkey) #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) @@ -549,7 +609,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); - ek = svc_expkey_lookup(&key, 0); + ek = svc_expkey_lookup(&key); if (ek != NULL) if ((err = cache_check(&svc_expkey_cache, &ek->h, reqp))) ek = ERR_PTR(err); @@ -569,7 +629,9 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, key.h.expiry_time = NEVER; key.h.flags = 0; - ek = svc_expkey_lookup(&key, 1); + ek = svc_expkey_lookup(&key); + if (ek) + ek = svc_expkey_update(&key,ek); if (ek) { expkey_put(&ek->h, &svc_expkey_cache); return 0; From d4d11ea9d6d6fcfaa66fbd10fd4a8d43aa575597 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:04 -0800 Subject: [PATCH 073/306] [PATCH] knfsd: Use new sunrpc cache for rsi cache Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/auth_gss/svcauth_gss.c | 66 +++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index aadb4e8d6aa7..237d935747a5 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -78,7 +78,8 @@ struct rsi { static struct cache_head *rsi_table[RSI_HASHMAX]; static struct cache_detail rsi_cache; -static struct rsi *rsi_lookup(struct rsi *item, int set); +static struct rsi *rsi_update(struct rsi *new, struct rsi *old); +static struct rsi *rsi_lookup(struct rsi *item); static void rsi_free(struct rsi *rsii) { @@ -103,8 +104,10 @@ static inline int rsi_hash(struct rsi *item) ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); } -static inline int rsi_match(struct rsi *item, struct rsi *tmp) +static int rsi_match(struct cache_head *a, struct cache_head *b) { + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); return netobj_equal(&item->in_handle, &tmp->in_handle) && netobj_equal(&item->in_token, &tmp->in_token); } @@ -125,8 +128,11 @@ static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) return dup_to_netobj(dst, src->data, src->len); } -static inline void rsi_init(struct rsi *new, struct rsi *item) +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) { + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + new->out_handle.data = NULL; new->out_handle.len = 0; new->out_token.data = NULL; @@ -141,8 +147,11 @@ static inline void rsi_init(struct rsi *new, struct rsi *item) item->in_token.data = NULL; } -static inline void rsi_update(struct rsi *new, struct rsi *item) +static void update_rsi(struct cache_head *cnew, struct cache_head *citem) { + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + BUG_ON(new->out_handle.data || new->out_token.data); new->out_handle.len = item->out_handle.len; item->out_handle.len = 0; @@ -157,6 +166,15 @@ static inline void rsi_update(struct rsi *new, struct rsi *item) new->minor_status = item->minor_status; } +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); + if (rsii) + return &rsii->h; + else + return NULL; +} + static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -198,6 +216,10 @@ static int rsi_parse(struct cache_detail *cd, if (dup_to_netobj(&rsii.in_token, buf, len)) goto out; + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + rsii.h.flags = 0; /* expiry */ expiry = get_expiry(&mesg); @@ -240,12 +262,14 @@ static int rsi_parse(struct cache_detail *cd, goto out; } rsii.h.expiry_time = expiry; - rsip = rsi_lookup(&rsii, 1); + rsip = rsi_update(&rsii, rsip); status = 0; out: rsi_free(&rsii); if (rsip) rsi_put(&rsip->h, &rsi_cache); + else + status = -ENOMEM; return status; } @@ -257,9 +281,37 @@ static struct cache_detail rsi_cache = { .cache_put = rsi_put, .cache_request = rsi_request, .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, }; -static DefineSimpleCacheLookup(rsi, rsi) +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + /* * The rpcsec_context cache is used to store a context that is @@ -895,7 +947,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) goto drop; } - rsip = rsi_lookup(&rsikey, 0); + rsip = rsi_lookup(&rsikey); rsi_free(&rsikey); if (!rsip) { goto drop; From 17f834b6d2e0b102ab53d73ba44d5dbb34c38f90 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:05 -0800 Subject: [PATCH 074/306] [PATCH] knfsd: Use new cache code for rsc cache Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/auth_gss/svcauth_gss.c | 74 ++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 237d935747a5..380152603d1e 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -345,7 +345,8 @@ struct rsc { static struct cache_head *rsc_table[RSC_HASHMAX]; static struct cache_detail rsc_cache; -static struct rsc *rsc_lookup(struct rsc *item, int set); +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); static void rsc_free(struct rsc *rsci) { @@ -372,15 +373,21 @@ rsc_hash(struct rsc *rsci) return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); } -static inline int -rsc_match(struct rsc *new, struct rsc *tmp) +static int +rsc_match(struct cache_head *a, struct cache_head *b) { + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + return netobj_equal(&new->handle, &tmp->handle); } -static inline void -rsc_init(struct rsc *new, struct rsc *tmp) +static void +rsc_init(struct cache_head *cnew, struct cache_head *ctmp) { + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + new->handle.len = tmp->handle.len; tmp->handle.len = 0; new->handle.data = tmp->handle.data; @@ -389,9 +396,12 @@ rsc_init(struct rsc *new, struct rsc *tmp) new->cred.cr_group_info = NULL; } -static inline void -rsc_update(struct rsc *new, struct rsc *tmp) +static void +update_rsc(struct cache_head *cnew, struct cache_head *ctmp) { + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + new->mechctx = tmp->mechctx; tmp->mechctx = NULL; memset(&new->seqdata, 0, sizeof(new->seqdata)); @@ -400,6 +410,16 @@ rsc_update(struct rsc *new, struct rsc *tmp) tmp->cred.cr_group_info = NULL; } +static struct cache_head * +rsc_alloc(void) +{ + struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); + if (rsci) + return &rsci->h; + else + return NULL; +} + static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { @@ -425,6 +445,10 @@ static int rsc_parse(struct cache_detail *cd, if (expiry == 0) goto out; + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + /* uid, or NEGATIVE */ rv = get_int(&mesg, &rsci.cred.cr_uid); if (rv == -EINVAL) @@ -480,12 +504,14 @@ static int rsc_parse(struct cache_detail *cd, gss_mech_put(gm); } rsci.h.expiry_time = expiry; - rscp = rsc_lookup(&rsci, 1); + rscp = rsc_update(&rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) rsc_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; return status; } @@ -496,9 +522,37 @@ static struct cache_detail rsc_cache = { .name = "auth.rpcsec.context", .cache_put = rsc_put, .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, }; -static DefineSimpleCacheLookup(rsc, rsc); +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + static struct rsc * gss_svc_searchbyctx(struct xdr_netobj *handle) @@ -509,7 +563,7 @@ gss_svc_searchbyctx(struct xdr_netobj *handle) memset(&rsci, 0, sizeof(rsci)); if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) return NULL; - found = rsc_lookup(&rsci, 0); + found = rsc_lookup(&rsci); rsc_free(&rsci); if (!found) return NULL; From f9ecc921b5b5e135050e7f22fef873c249aee3fc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:06 -0800 Subject: [PATCH 075/306] [PATCH] knfsd: Use new cache code for name/id lookup caches Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4idmap.c | 126 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index dea690aa8bb5..75cfbb68b205 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -82,9 +82,12 @@ struct ent { #define ENT_HASHMAX (1 << ENT_HASHBITS) #define ENT_HASHMASK (ENT_HASHMAX - 1) -static inline void -ent_init(struct ent *new, struct ent *itm) +static void +ent_init(struct cache_head *cnew, struct cache_head *citm) { + struct ent *new = container_of(cnew, struct ent, h); + struct ent *itm = container_of(citm, struct ent, h); + new->id = itm->id; new->type = itm->type; @@ -92,12 +95,6 @@ ent_init(struct ent *new, struct ent *itm) strlcpy(new->authname, itm->authname, sizeof(new->name)); } -static inline void -ent_update(struct ent *new, struct ent *itm) -{ - ent_init(new, itm); -} - static void ent_put(struct cache_head *ch, struct cache_detail *cd) { @@ -107,6 +104,16 @@ ent_put(struct cache_head *ch, struct cache_detail *cd) } } +static struct cache_head * +ent_alloc(void) +{ + struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); + if (e) + return &e->h; + else + return NULL; +} + /* * ID -> Name cache */ @@ -143,9 +150,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, (*bpp)[-1] = '\n'; } -static inline int -idtoname_match(struct ent *a, struct ent *b) +static int +idtoname_match(struct cache_head *ca, struct cache_head *cb) { + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + return (a->id == b->id && a->type == b->type && strcmp(a->authname, b->authname) == 0); } @@ -178,7 +188,8 @@ warn_no_idmapd(struct cache_detail *detail) static int idtoname_parse(struct cache_detail *, char *, int); -static struct ent *idtoname_lookup(struct ent *, int); +static struct ent *idtoname_lookup(struct ent *); +static struct ent *idtoname_update(struct ent *, struct ent *); static struct cache_detail idtoname_cache = { .owner = THIS_MODULE, @@ -190,6 +201,10 @@ static struct cache_detail idtoname_cache = { .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, + .match = idtoname_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, }; int @@ -232,6 +247,11 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) if (ent.h.expiry_time == 0) goto out; + error = -ENOMEM; + res = idtoname_lookup(&ent); + if (!res) + goto out; + /* Name */ error = qword_get(&buf, buf1, PAGE_SIZE); if (error == -EINVAL) @@ -246,7 +266,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) memcpy(ent.name, buf1, sizeof(ent.name)); } error = -ENOMEM; - if ((res = idtoname_lookup(&ent, 1)) == NULL) + res = idtoname_update(&ent, res); + if (res == NULL) goto out; ent_put(&res->h, &idtoname_cache); @@ -258,7 +279,31 @@ out: return error; } -static DefineSimpleCacheLookup(ent, idtoname); + +static struct ent * +idtoname_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&idtoname_cache, + &item->h, + idtoname_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +idtoname_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&idtoname_cache, + &new->h, &old->h, + idtoname_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + /* * Name -> ID cache @@ -285,9 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, (*bpp)[-1] = '\n'; } -static inline int -nametoid_match(struct ent *a, struct ent *b) +static int +nametoid_match(struct cache_head *ca, struct cache_head *cb) { + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + return (a->type == b->type && strcmp(a->name, b->name) == 0 && strcmp(a->authname, b->authname) == 0); } @@ -311,7 +359,8 @@ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) return 0; } -static struct ent *nametoid_lookup(struct ent *, int); +static struct ent *nametoid_lookup(struct ent *); +static struct ent *nametoid_update(struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); static struct cache_detail nametoid_cache = { @@ -324,6 +373,10 @@ static struct cache_detail nametoid_cache = { .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, + .match = nametoid_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, }; static int @@ -373,7 +426,11 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) set_bit(CACHE_NEGATIVE, &ent.h.flags); error = -ENOMEM; - if ((res = nametoid_lookup(&ent, 1)) == NULL) + res = nametoid_lookup(&ent); + if (res == NULL) + goto out; + res = nametoid_update(&ent, res); + if (res == NULL) goto out; ent_put(&res->h, &nametoid_cache); @@ -384,7 +441,30 @@ out: return (error); } -static DefineSimpleCacheLookup(ent, nametoid); + +static struct ent * +nametoid_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&nametoid_cache, + &item->h, + nametoid_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +nametoid_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&nametoid_cache, + &new->h, &old->h, + nametoid_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} /* * Exported API @@ -452,24 +532,24 @@ idmap_defer(struct cache_req *req) } static inline int -do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *, int), struct ent *key, +do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item, struct idmap_defer_req *mdr) { - *item = lookup_fn(key, 0); + *item = lookup_fn(key); if (!*item) return -ENOMEM; return cache_check(detail, &(*item)->h, &mdr->req); } static inline int -do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *, int), +do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { int ret = -ENOMEM; - *item = lookup_fn(key, 0); + *item = lookup_fn(key); if (!*item) goto out_err; ret = -ETIMEDOUT; @@ -490,7 +570,7 @@ out_err: static int idmap_lookup(struct svc_rqst *rqstp, - struct ent *(*lookup_fn)(struct ent *, int), struct ent *key, + struct ent *(*lookup_fn)(struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { struct idmap_defer_req *mdr; From 4013edea9a0b6cdcb1fdf5d4011e47e068fd6efb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:07 -0800 Subject: [PATCH 076/306] [PATCH] knfsd: An assortment of little fixes to the sunrpc cache code - in cache_check, h must be non-NULL as it has been de-referenced, so don't bother checking for NULL. - When a cache-item is updated, we need to call cache_revisit_request to see if there is a pending request waiting for that item. We were using a transition to CACHE_VALID to see if that was needed, however that is wrong as an expired entry will still be marked 'valid' (as the data is valid and will need to be released). So instead use an off transition for CACHE_PENDING which is exactly the right thing to test. - Add a little bit more debugging info. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/cache.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4449dc52edf5..b242f491cea9 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -208,7 +208,7 @@ int cache_check(struct cache_detail *detail, if (rv == -EAGAIN) cache_defer_req(rqstp, h); - if (rv && h) + if (rv) detail->cache_put(h, detail); return rv; } @@ -223,8 +223,10 @@ void cache_fresh(struct cache_detail *detail, head->last_refresh = get_seconds(); if (!test_and_set_bit(CACHE_VALID, &head->flags)) cache_revisit_request(head); - if (test_and_clear_bit(CACHE_PENDING, &head->flags)) + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { + cache_revisit_request(head); queue_loose(detail, head); + } } /* @@ -551,7 +553,7 @@ static void cache_defer_req(struct cache_req *req, struct cache_head *item) /* there was one too many */ dreq->revisit(dreq, 1); } - if (test_bit(CACHE_VALID, &item->flags)) { + if (!test_bit(CACHE_PENDING, &item->flags)) { /* must have just been validated... */ cache_revisit_request(item); } @@ -892,7 +894,7 @@ static void queue_loose(struct cache_detail *detail, struct cache_head *ch) if (cr->item != ch) continue; if (cr->readers != 0) - break; + continue; list_del(&cr->q.list); spin_unlock(&queue_lock); detail->cache_put(cr->item, detail); @@ -1180,8 +1182,8 @@ static int c_show(struct seq_file *m, void *p) return cd->cache_show(m, cd, NULL); ifdebug(CACHE) - seq_printf(m, "# expiry=%ld refcnt=%d\n", - cp->expiry_time, atomic_read(&cp->refcnt)); + seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", + cp->expiry_time, atomic_read(&cp->refcnt), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ From 4d90452cb23b08a9a9dd001010f0ee6b1ee83a45 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:07 -0800 Subject: [PATCH 077/306] [PATCH] knfsd: Remove DefineCacheLookup This has been replaced by more traditional code. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 113 ----------------------------------- 1 file changed, 113 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3e17a5ff1dea..afc481dd02dd 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -128,119 +128,6 @@ struct cache_deferred_req { int too_many); }; -/* - * just like a template in C++, this macro does cache lookup - * for us. - * The function is passed some sort of HANDLE from which a cache_detail - * structure can be determined (via SETUP, DETAIL), a template - * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the - * TEST, the function will try to find a matching cache entry in the cache. - * If "set" == 0 : - * If an entry is found, it is returned - * If no entry is found, a new non-VALID entry is created. - * If "set" == 1 : - * If no entry is found a new one is inserted with data from "template" - * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE - * If a CACHE_VALID entry is found, a new entry is swapped in with data - * from "template" - * - * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not - * run but insteead CACHE_NEGATIVE is set in any new item. - - * In any case, the new entry is returned with a reference count. - * - * - * RTN is a struct type for a cache entry - * MEMBER is the member of the cache which is cache_head, which must be first - * FNAME is the name for the function - * ARGS are arguments to function and must contain RTN *item, int set. May - * also contain something to be usedby SETUP or DETAIL to find cache_detail. - * SETUP locates the cache detail and makes it available as... - * DETAIL identifies the cache detail, possibly set up by SETUP - * HASHFN returns a hash value of the cache entry "item" - * TEST tests if "tmp" matches "item" - * INIT copies key information from "item" to "new" - * UPDATE copies content information from "item" to "tmp" - */ -#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE) \ -RTN *FNAME ARGS \ -{ \ - RTN *tmp, *new=NULL; \ - struct cache_head **hp, **head; \ - SETUP; \ - head = &(DETAIL)->hash_table[HASHFN]; \ - retry: \ - if (set||new) write_lock(&(DETAIL)->hash_lock); \ - else read_lock(&(DETAIL)->hash_lock); \ - for(hp=head; *hp != NULL; hp = &tmp->MEMBER.next) { \ - tmp = container_of(*hp, RTN, MEMBER); \ - if (TEST) { /* found a match */ \ - \ - if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ - break; \ - \ - if (new) \ - {INIT;} \ - if (set) { \ - if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ - { /* need to swap in new */ \ - RTN *t2; \ - \ - new->MEMBER.next = tmp->MEMBER.next; \ - *hp = &new->MEMBER; \ - tmp->MEMBER.next = NULL; \ - t2 = tmp; tmp = new; new = t2; \ - } \ - if (test_bit(CACHE_NEGATIVE, &item->MEMBER.flags)) \ - set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - else { \ - UPDATE; \ - clear_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - } \ - } \ - cache_get(&tmp->MEMBER); \ - if (set||new) write_unlock(&(DETAIL)->hash_lock); \ - else read_unlock(&(DETAIL)->hash_lock); \ - if (set) \ - cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ - if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ - if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ - return tmp; \ - } \ - } \ - /* Didn't find anything */ \ - if (new) { \ - INIT; \ - new->MEMBER.next = *head; \ - *head = &new->MEMBER; \ - (DETAIL)->entries ++; \ - cache_get(&new->MEMBER); \ - if (set) { \ - tmp = new; \ - if (test_bit(CACHE_NEGATIVE, &item->MEMBER.flags)) \ - set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - else {UPDATE;} \ - } \ - } \ - if (set||new) write_unlock(&(DETAIL)->hash_lock); \ - else read_unlock(&(DETAIL)->hash_lock); \ - if (new && set) \ - cache_fresh(DETAIL, &new->MEMBER, item->MEMBER.expiry_time); \ - if (new) \ - return new; \ - new = kmalloc(sizeof(*new), GFP_KERNEL); \ - if (new) { \ - cache_init(&new->MEMBER); \ - goto retry; \ - } \ - return NULL; \ -} - -#define DefineSimpleCacheLookup(STRUCT, FUNC) \ - DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ - (struct STRUCT *item, int set), /*no setup */, \ - & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ - STRUCT##_init(new, item), STRUCT##_update(tmp, item)) extern struct cache_head * sunrpc_cache_lookup(struct cache_detail *detail, From ebd0cb1af3be2729cc1f574681dfba01fcf458d9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:08 -0800 Subject: [PATCH 078/306] [PATCH] knfsd: Unexport cache_fresh and fix a small race Cache_fresh is now only used in cache.c, so unexport it. Part of cache_fresh (setting CACHE_VALID) should really be done under the lock, while part (calling cache_revisit_request etc) must be done outside the lock. So we split it up appropriately. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 2 -- net/sunrpc/cache.c | 51 +++++++++++++++++++++--------------- net/sunrpc/sunrpc_syms.c | 1 - 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index afc481dd02dd..a37fead1873b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -165,8 +165,6 @@ static inline int cache_put(struct cache_head *h, struct cache_detail *cd) } extern void cache_init(struct cache_head *h); -extern void cache_fresh(struct cache_detail *detail, - struct cache_head *head, time_t expiry); extern int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index b242f491cea9..edcda4fd88e8 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -96,6 +96,27 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, } EXPORT_SYMBOL(sunrpc_cache_lookup); + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch); + +static int cache_fresh_locked(struct cache_head *head, time_t expiry) +{ + head->expiry_time = expiry; + head->last_refresh = get_seconds(); + return !test_and_set_bit(CACHE_VALID, &head->flags); +} + +static void cache_fresh_unlocked(struct cache_head *head, + struct cache_detail *detail, int new) +{ + if (new) + cache_revisit_request(head); + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { + cache_revisit_request(head); + queue_loose(detail, head); + } +} + struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { @@ -105,6 +126,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, */ struct cache_head **head; struct cache_head *tmp; + int is_new; if (!test_bit(CACHE_VALID, &old->flags)) { write_lock(&detail->hash_lock); @@ -113,9 +135,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - /* FIXME cache_fresh should come first */ + is_new = cache_fresh_locked(old, new->expiry_time); write_unlock(&detail->hash_lock); - cache_fresh(detail, old, new->expiry_time); + cache_fresh_unlocked(old, detail, is_new); return old; } write_unlock(&detail->hash_lock); @@ -138,9 +160,11 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, tmp->next = *head; *head = tmp; cache_get(tmp); + is_new = cache_fresh_locked(tmp, new->expiry_time); + cache_fresh_locked(old, 0); write_unlock(&detail->hash_lock); - cache_fresh(detail, tmp, new->expiry_time); - cache_fresh(detail, old, 0); + cache_fresh_unlocked(tmp, detail, is_new); + cache_fresh_unlocked(old, detail, 0); detail->cache_put(old, detail); return tmp; } @@ -192,7 +216,8 @@ int cache_check(struct cache_detail *detail, clear_bit(CACHE_PENDING, &h->flags); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh(detail, h, get_seconds()+CACHE_NEW_EXPIRY); + cache_fresh_unlocked(h, detail, + cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY)); rv = -ENOENT; } break; @@ -213,22 +238,6 @@ int cache_check(struct cache_detail *detail, return rv; } -static void queue_loose(struct cache_detail *detail, struct cache_head *ch); - -void cache_fresh(struct cache_detail *detail, - struct cache_head *head, time_t expiry) -{ - - head->expiry_time = expiry; - head->last_refresh = get_seconds(); - if (!test_and_set_bit(CACHE_VALID, &head->flags)) - cache_revisit_request(head); - if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { - cache_revisit_request(head); - queue_loose(detail, head); - } -} - /* * caches need to be periodically cleaned. * For this we maintain a list of cache_detail and diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 40401196e7de..69b8238f3d10 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup); EXPORT_SYMBOL(cache_check); EXPORT_SYMBOL(cache_flush); EXPORT_SYMBOL(cache_purge); -EXPORT_SYMBOL(cache_fresh); EXPORT_SYMBOL(cache_init); EXPORT_SYMBOL(cache_register); EXPORT_SYMBOL(cache_unregister); From baab935ff3bdac20c558809da0d8e8f761840219 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:09 -0800 Subject: [PATCH 079/306] [PATCH] knfsd: Convert sunrpc_cache to use krefs .. it makes some of the code nicer. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 51 +++++++++++++++---------------- fs/nfsd/nfs4idmap.c | 18 +++++------ fs/nfsd/nfsfh.c | 2 +- include/linux/nfsd/export.h | 4 +-- include/linux/sunrpc/cache.h | 13 ++++---- net/sunrpc/auth_gss/svcauth_gss.c | 28 ++++++++--------- net/sunrpc/cache.c | 20 ++++++------ net/sunrpc/svcauth_unix.c | 20 ++++++------ 8 files changed, 72 insertions(+), 84 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index abd68965822f..cc811a1094cb 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -57,18 +57,17 @@ static int exp_verify_string(char *cp, int max); #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static struct cache_head *expkey_table[EXPKEY_HASHMAX]; -void expkey_put(struct cache_head *item, struct cache_detail *cd) +void expkey_put(struct kref *ref) { - if (cache_put(item, cd)) { - struct svc_expkey *key = container_of(item, struct svc_expkey, h); - if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) { - dput(key->ek_dentry); - mntput(key->ek_mnt); - } - auth_domain_put(key->ek_client); - kfree(key); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) { + dput(key->ek_dentry); + mntput(key->ek_mnt); } + auth_domain_put(key->ek_client); + kfree(key); } static void expkey_request(struct cache_detail *cd, @@ -158,7 +157,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(&key, ek); if (ek) - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); else err = -ENOMEM; } else { struct nameidata nd; @@ -172,7 +171,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) ek = svc_expkey_update(&key, ek); if (ek) - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); else err = -ENOMEM; path_release(&nd); @@ -318,15 +317,13 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) static struct cache_head *export_table[EXPORT_HASHMAX]; -void svc_export_put(struct cache_head *item, struct cache_detail *cd) +static void svc_export_put(struct kref *ref) { - if (cache_put(item, cd)) { - struct svc_export *exp = container_of(item, struct svc_export, h); - dput(exp->ex_dentry); - mntput(exp->ex_mnt); - auth_domain_put(exp->ex_client); - kfree(exp); - } + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + dput(exp->ex_dentry); + mntput(exp->ex_mnt); + auth_domain_put(exp->ex_client); + kfree(exp); } static void svc_export_request(struct cache_detail *cd, @@ -633,7 +630,7 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, if (ek) ek = svc_expkey_update(&key,ek); if (ek) { - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); return 0; } return -ENOMEM; @@ -762,7 +759,7 @@ static void exp_fsid_unhash(struct svc_export *exp) ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); if (ek && !IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); } svc_expkey_cache.nextcheck = get_seconds(); } @@ -800,7 +797,7 @@ static void exp_unhash(struct svc_export *exp) ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); if (ek && !IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); } svc_expkey_cache.nextcheck = get_seconds(); } @@ -902,7 +899,7 @@ finish: if (exp) exp_put(exp); if (fsid_key && !IS_ERR(fsid_key)) - expkey_put(&fsid_key->h, &svc_expkey_cache); + cache_put(&fsid_key->h, &svc_expkey_cache); if (clp) auth_domain_put(clp); path_release(&nd); @@ -1030,7 +1027,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, return ERR_PTR(PTR_ERR(ek)); exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); if (!exp || IS_ERR(exp)) return ERR_PTR(PTR_ERR(exp)); @@ -1068,7 +1065,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, else rv = fh_compose(fhp, exp, fsid_key->ek_dentry, NULL); - expkey_put(&fsid_key->h, &svc_expkey_cache); + cache_put(&fsid_key->h, &svc_expkey_cache); return rv; } @@ -1187,7 +1184,7 @@ static int e_show(struct seq_file *m, void *p) cache_get(&exp->h); if (cache_check(&svc_export_cache, &exp->h, NULL)) return 0; - if (cache_put(&exp->h, &svc_export_cache)) BUG(); + cache_put(&exp->h, &svc_export_cache); return svc_export_show(m, &svc_export_cache, cp); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 75cfbb68b205..4b6aa60dfceb 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -96,12 +96,10 @@ ent_init(struct cache_head *cnew, struct cache_head *citm) } static void -ent_put(struct cache_head *ch, struct cache_detail *cd) +ent_put(struct kref *ref) { - if (cache_put(ch, cd)) { - struct ent *map = container_of(ch, struct ent, h); - kfree(map); - } + struct ent *map = container_of(ref, struct ent, h.ref); + kfree(map); } static struct cache_head * @@ -270,7 +268,7 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) if (res == NULL) goto out; - ent_put(&res->h, &idtoname_cache); + cache_put(&res->h, &idtoname_cache); error = 0; out: @@ -433,7 +431,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) if (res == NULL) goto out; - ent_put(&res->h, &nametoid_cache); + cache_put(&res->h, &nametoid_cache); error = 0; out: kfree(buf1); @@ -562,7 +560,7 @@ do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *), goto out_put; return 0; out_put: - ent_put(&(*item)->h, detail); + cache_put(&(*item)->h, detail); out_err: *item = NULL; return ret; @@ -613,7 +611,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen if (ret) return ret; *id = item->id; - ent_put(&item->h, &nametoid_cache); + cache_put(&item->h, &nametoid_cache); return 0; } @@ -635,7 +633,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) ret = strlen(item->name); BUG_ON(ret > IDMAP_NAMESZ); memcpy(name, item->name, ret); - ent_put(&item->h, &idtoname_cache); + cache_put(&item->h, &idtoname_cache); return ret; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 7a3e397b4ed3..3f2ec2e6d06c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -506,7 +506,7 @@ fh_put(struct svc_fh *fhp) nfsd_nr_put++; } if (exp) { - svc_export_put(&exp->h, &svc_export_cache); + cache_put(&exp->h, &svc_export_cache); fhp->fh_export = NULL; } return; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index d52e0b7ad37b..a6c08a47b25c 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -102,13 +102,11 @@ int exp_rootfh(struct auth_domain *, int exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq); int nfserrno(int errno); -extern void expkey_put(struct cache_head *item, struct cache_detail *cd); -extern void svc_export_put(struct cache_head *item, struct cache_detail *cd); extern struct cache_detail svc_export_cache, svc_expkey_cache; static inline void exp_put(struct svc_export *exp) { - svc_export_put(&exp->h, &svc_export_cache); + cache_put(&exp->h, &svc_export_cache); } static inline void exp_get(struct svc_export *exp) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index a37fead1873b..ad3f5cbdb770 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -50,7 +50,7 @@ struct cache_head { time_t last_refresh; /* If CACHE_PENDING, this is when upcall * was sent, else this is when update was received */ - atomic_t refcnt; + struct kref ref; unsigned long flags; }; #define CACHE_VALID 0 /* Entry contains valid data */ @@ -68,8 +68,7 @@ struct cache_detail { atomic_t inuse; /* active user-space update or lookup */ char *name; - void (*cache_put)(struct cache_head *, - struct cache_detail*); + void (*cache_put)(struct kref *); void (*cache_request)(struct cache_detail *cd, struct cache_head *h, @@ -151,17 +150,17 @@ extern void cache_clean_deferred(void *owner); static inline struct cache_head *cache_get(struct cache_head *h) { - atomic_inc(&h->refcnt); + kref_get(&h->ref); return h; } -static inline int cache_put(struct cache_head *h, struct cache_detail *cd) +static inline void cache_put(struct cache_head *h, struct cache_detail *cd) { - if (atomic_read(&h->refcnt) <= 2 && + if (atomic_read(&h->ref.refcount) <= 2 && h->expiry_time < cd->nextcheck) cd->nextcheck = h->expiry_time; - return atomic_dec_and_test(&h->refcnt); + kref_put(&h->ref, cd->cache_put); } extern void cache_init(struct cache_head *h); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 380152603d1e..4d7eb9e704da 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -89,13 +89,11 @@ static void rsi_free(struct rsi *rsii) kfree(rsii->out_token.data); } -static void rsi_put(struct cache_head *item, struct cache_detail *cd) +static void rsi_put(struct kref *ref) { - struct rsi *rsii = container_of(item, struct rsi, h); - if (cache_put(item, cd)) { - rsi_free(rsii); - kfree(rsii); - } + struct rsi *rsii = container_of(ref, struct rsi, h.ref); + rsi_free(rsii); + kfree(rsii); } static inline int rsi_hash(struct rsi *item) @@ -267,7 +265,7 @@ static int rsi_parse(struct cache_detail *cd, out: rsi_free(&rsii); if (rsip) - rsi_put(&rsip->h, &rsi_cache); + cache_put(&rsip->h, &rsi_cache); else status = -ENOMEM; return status; @@ -357,14 +355,12 @@ static void rsc_free(struct rsc *rsci) put_group_info(rsci->cred.cr_group_info); } -static void rsc_put(struct cache_head *item, struct cache_detail *cd) +static void rsc_put(struct kref *ref) { - struct rsc *rsci = container_of(item, struct rsc, h); + struct rsc *rsci = container_of(ref, struct rsc, h.ref); - if (cache_put(item, cd)) { - rsc_free(rsci); - kfree(rsci); - } + rsc_free(rsci); + kfree(rsci); } static inline int @@ -509,7 +505,7 @@ static int rsc_parse(struct cache_detail *cd, out: rsc_free(&rsci); if (rscp) - rsc_put(&rscp->h, &rsc_cache); + cache_put(&rscp->h, &rsc_cache); else status = -ENOMEM; return status; @@ -1076,7 +1072,7 @@ drop: ret = SVC_DROP; out: if (rsci) - rsc_put(&rsci->h, &rsc_cache); + cache_put(&rsci->h, &rsc_cache); return ret; } @@ -1168,7 +1164,7 @@ out_err: put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd->rsci) - rsc_put(&gsd->rsci->h, &rsc_cache); + cache_put(&gsd->rsci->h, &rsc_cache); gsd->rsci = NULL; return stat; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index edcda4fd88e8..dd81e5928172 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -42,7 +42,7 @@ void cache_init(struct cache_head *h) time_t now = get_seconds(); h->next = NULL; h->flags = 0; - atomic_set(&h->refcnt, 1); + kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; h->last_refresh = now; } @@ -81,7 +81,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, if (detail->match(tmp, key)) { cache_get(tmp); write_unlock(&detail->hash_lock); - detail->cache_put(new, detail); + cache_put(new, detail); return tmp; } } @@ -145,7 +145,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, /* We need to insert a new entry */ tmp = detail->alloc(); if (!tmp) { - detail->cache_put(old, detail); + cache_put(old, detail); return NULL; } cache_init(tmp); @@ -165,7 +165,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, write_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail, is_new); cache_fresh_unlocked(old, detail, 0); - detail->cache_put(old, detail); + cache_put(old, detail); return tmp; } EXPORT_SYMBOL(sunrpc_cache_update); @@ -234,7 +234,7 @@ int cache_check(struct cache_detail *detail, cache_defer_req(rqstp, h); if (rv) - detail->cache_put(h, detail); + cache_put(h, detail); return rv; } @@ -431,7 +431,7 @@ static int cache_clean(void) if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) queue_loose(current_detail, ch); - if (atomic_read(&ch->refcnt) == 1) + if (atomic_read(&ch->ref.refcount) == 1) break; } if (ch) { @@ -446,7 +446,7 @@ static int cache_clean(void) current_index ++; spin_unlock(&cache_list_lock); if (ch) - d->cache_put(ch, d); + cache_put(ch, d); } else spin_unlock(&cache_list_lock); @@ -723,7 +723,7 @@ cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) !test_bit(CACHE_PENDING, &rq->item->flags)) { list_del(&rq->q.list); spin_unlock(&queue_lock); - cd->cache_put(rq->item, cd); + cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else @@ -906,7 +906,7 @@ static void queue_loose(struct cache_detail *detail, struct cache_head *ch) continue; list_del(&cr->q.list); spin_unlock(&queue_lock); - detail->cache_put(cr->item, detail); + cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); return; @@ -1192,7 +1192,7 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", - cp->expiry_time, atomic_read(&cp->refcnt), cp->flags); + cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 7e38621a20b7..11020c0b7db5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -84,15 +84,15 @@ struct ip_map { }; static struct cache_head *ip_table[IP_HASHMAX]; -static void ip_map_put(struct cache_head *item, struct cache_detail *cd) +static void ip_map_put(struct kref *kref) { + struct cache_head *item = container_of(kref, struct cache_head, ref); struct ip_map *im = container_of(item, struct ip_map,h); - if (cache_put(item, cd)) { - if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) - auth_domain_put(&im->m_client->h); - kfree(im); - } + + if (test_bit(CACHE_VALID, &item->flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + auth_domain_put(&im->m_client->h); + kfree(im); } #if IP_HASHBITS == 8 @@ -315,7 +315,7 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex hash_ip((unsigned long)ipm->m_addr.s_addr)); if (!ch) return -ENOMEM; - ip_map_put(ch, &ip_map_cache); + cache_put(ch, &ip_map_cache); return 0; } @@ -369,7 +369,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) rv = &ipm->m_client->h; kref_get(&rv->ref); } - ip_map_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, &ip_map_cache); return rv; } @@ -403,7 +403,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) case 0: rqstp->rq_client = &ipm->m_client->h; kref_get(&rqstp->rq_client->ref); - ip_map_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, &ip_map_cache); break; } return SVC_OK; From 74cae61ab45f19a3e8c4d9f53c0e94df129c7915 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 27 Mar 2006 01:15:10 -0800 Subject: [PATCH 080/306] [PATCH] fs/nfsd/export.c,net/sunrpc/cache.c: make needlessly global code static We can now make some code static. Signed-off-by: Adrian Bunk Cc: Neil Brown Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 13 ++++++++----- include/linux/nfsd/export.h | 5 +---- include/linux/sunrpc/cache.h | 1 - net/sunrpc/cache.c | 2 +- net/sunrpc/sunrpc_syms.c | 1 - 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cc811a1094cb..c340be0a3f59 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -57,7 +57,7 @@ static int exp_verify_string(char *cp, int max); #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static struct cache_head *expkey_table[EXPKEY_HASHMAX]; -void expkey_put(struct kref *ref) +static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); @@ -87,6 +87,8 @@ static void expkey_request(struct cache_detail *cd, static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); +static struct cache_detail svc_expkey_cache; + static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid [path] */ @@ -255,7 +257,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -struct cache_detail svc_expkey_cache = { +static struct cache_detail svc_expkey_cache = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .hash_table = expkey_table, @@ -345,7 +347,8 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct inode *inode, int flags) @@ -574,7 +577,7 @@ svc_export_lookup(struct svc_export *exp) return NULL; } -struct svc_export * +static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; @@ -593,7 +596,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old) } -struct svc_expkey * +static struct svc_expkey * exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index a6c08a47b25c..d2a8abb5011a 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -86,9 +86,6 @@ void nfsd_export_shutdown(void); void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); -struct svc_expkey * exp_find_key(struct auth_domain *clp, - int fsid_type, u32 *fsidv, - struct cache_req *reqp); struct svc_export * exp_get_by_name(struct auth_domain *clp, struct vfsmount *mnt, struct dentry *dentry, @@ -102,7 +99,7 @@ int exp_rootfh(struct auth_domain *, int exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq); int nfserrno(int errno); -extern struct cache_detail svc_export_cache, svc_expkey_cache; +extern struct cache_detail svc_export_cache; static inline void exp_put(struct svc_export *exp) { diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index ad3f5cbdb770..b5612c958cce 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -163,7 +163,6 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) kref_put(&h->ref, cd->cache_put); } -extern void cache_init(struct cache_head *h); extern int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index dd81e5928172..3ac4193a78ed 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -37,7 +37,7 @@ static void cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -void cache_init(struct cache_head *h) +static void cache_init(struct cache_head *h) { time_t now = get_seconds(); h->next = NULL; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 69b8238f3d10..769114f0f886 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup); EXPORT_SYMBOL(cache_check); EXPORT_SYMBOL(cache_flush); EXPORT_SYMBOL(cache_purge); -EXPORT_SYMBOL(cache_init); EXPORT_SYMBOL(cache_register); EXPORT_SYMBOL(cache_unregister); EXPORT_SYMBOL(qword_add); From ad1b5229def92b71631a927895b034ceec06c991 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:11 -0800 Subject: [PATCH 081/306] [PATCH] knfsd: Tidy up unix_domain_find We shouldn't really compare &new->h with anything when new ==NULL, and gather three different if statements that all start if (rv ... into one large if. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svcauth_unix.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 11020c0b7db5..7e5707e2d6b6 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -36,16 +36,16 @@ struct auth_domain *unix_domain_find(char *name) rv = auth_domain_lookup(name, NULL); while(1) { - if (rv != &new->h) { - if (new) auth_domain_put(&new->h); + if (rv) { + if (new && rv != &new->h) + auth_domain_put(&new->h); + + if (rv->flavour != &svcauth_unix) { + auth_domain_put(rv); + return NULL; + } return rv; } - if (rv && rv->flavour != &svcauth_unix) { - auth_domain_put(rv); - return NULL; - } - if (rv) - return rv; new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) From 491214716755894986b99e0cf0a08b830d244577 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:12 -0800 Subject: [PATCH 082/306] [PATCH] knfsd: Update rpc-cache.txt to match recent changes Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rpc-cache.txt | 121 ++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 45 deletions(-) diff --git a/Documentation/rpc-cache.txt b/Documentation/rpc-cache.txt index 2b5d4434fa5a..5f757c8cf979 100644 --- a/Documentation/rpc-cache.txt +++ b/Documentation/rpc-cache.txt @@ -1,4 +1,4 @@ -This document gives a brief introduction to the caching + This document gives a brief introduction to the caching mechanisms in the sunrpc layer that is used, in particular, for NFS authentication. @@ -25,25 +25,17 @@ The common code handles such things as: - supporting 'NEGATIVE' as well as positive entries - allowing an EXPIRED time on cache items, and removing items after they expire, and are no longe in-use. - - Future code extensions are expect to handle - making requests to user-space to fill in cache entries - allowing user-space to directly set entries in the cache - delaying RPC requests that depend on as-yet incomplete cache entries, and replaying those requests when the cache entry is complete. - - maintaining last-access times on cache entries - - clean out old entries when the caches become full - -The code for performing a cache lookup is also common, but in the form -of a template. i.e. a #define. -Each cache defines a lookup function by using the DefineCacheLookup -macro, or the simpler DefineSimpleCacheLookup macro + - clean out old entries as they expire. Creating a Cache ---------------- -1/ A cache needs a datum to cache. This is in the form of a +1/ A cache needs a datum to store. This is in the form of a structure definition that must contain a struct cache_head as an element, usually the first. @@ -51,35 +43,69 @@ Creating a Cache Each cache element is reference counted and contains expiry and update times for use in cache management. 2/ A cache needs a "cache_detail" structure that - describes the cache. This stores the hash table, and some - parameters for cache management. -3/ A cache needs a lookup function. This is created using - the DefineCacheLookup macro. This lookup function is used both - to find entries and to update entries. The normal mode for - updating an entry is to replace the old entry with a new - entry. However it is possible to allow update-in-place - for those caches where it makes sense (no atomicity issues - or indirect reference counting issue) -4/ A cache needs to be registered using cache_register(). This - includes in on a list of caches that will be regularly - cleaned to discard old data. For this to work, some - thread must periodically call cache_clean - + describes the cache. This stores the hash table, some + parameters for cache management, and some operations detailing how + to work with particular cache items. + The operations requires are: + struct cache_head *alloc(void) + This simply allocates appropriate memory and returns + a pointer to the cache_detail embedded within the + structure + void cache_put(struct kref *) + This is called when the last reference to an item is + is dropped. The pointer passed is to the 'ref' field + in the cache_head. cache_put should release any + references create by 'cache_init' and, if CACHE_VALID + is set, any references created by cache_update. + It should then release the memory allocated by + 'alloc'. + int match(struct cache_head *orig, struct cache_head *new) + test if the keys in the two structures match. Return + 1 if they do, 0 if they don't. + void init(struct cache_head *orig, struct cache_head *new) + Set the 'key' fields in 'new' from 'orig'. This may + include taking references to shared objects. + void update(struct cache_head *orig, struct cache_head *new) + Set the 'content' fileds in 'new' from 'orig'. + int cache_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) + Optional. Used to provide a /proc file that lists the + contents of a cache. This should show one item, + usually on just one line. + int cache_request(struct cache_detail *cd, struct cache_head *h, + char **bpp, int *blen) + Format a request to be send to user-space for an item + to be instantiated. *bpp is a buffer of size *blen. + bpp should be moved forward over the encoded message, + and *blen should be reduced to show how much free + space remains. Return 0 on success or <0 if not + enough room or other problem. + int cache_parse(struct cache_detail *cd, char *buf, int len) + A message from user space has arrived to fill out a + cache entry. It is in 'buf' of length 'len'. + cache_parse should parse this, find the item in the + cache with sunrpc_cache_lookup, and update the item + with sunrpc_cache_update. + + +3/ A cache needs to be registered using cache_register(). This + includes it on a list of caches that will be regularly + cleaned to discard old data. + Using a cache ------------- -To find a value in a cache, call the lookup function passing it a the -datum which contains key, and possibly content, and a flag saying -whether to update the cache with new data from the datum. Depending -on how the cache lookup function was defined, it may take an extra -argument to identify the particular cache in question. +To find a value in a cache, call sunrpc_cache_lookup passing a pointer +to the cache_head in a sample item with the 'key' fields filled in. +This will be passed to ->match to identify the target entry. If no +entry is found, a new entry will be create, added to the cache, and +marked as not containing valid data. -Except in cases of kmalloc failure, the lookup function -will return a new datum which will store the key and -may contain valid content, or may not. -This datum is typically passed to cache_check which determines the -validity of the datum and may later initiate an upcall to fill -in the data. +The item returned is typically passed to cache_check which will check +if the data is valid, and may initiate an up-call to get fresh data. +cache_check will return -ENOENT in the entry is negative or if an up +call is needed but not possible, -EAGAIN if an upcall is pending, +or 0 if the data is valid; cache_check can be passed a "struct cache_req *". This structure is typically embedded in the actual request and can be used to create a @@ -90,6 +116,13 @@ item does become valid, the deferred copy of the request will be revisited (->revisit). It is expected that this method will reschedule the request for processing. +The value returned by sunrpc_cache_lookup can also be passed to +sunrpc_cache_update to set the content for the item. A second item is +passed which should hold the content. If the item found by _lookup +has valid data, then it is discarded and a new item is created. This +saves any user of an item from worrying about content changing while +it is being inspected. If the item found by _lookup does not contain +valid data, then the content is copied across and CACHE_VALID is set. Populating a cache ------------------ @@ -114,8 +147,8 @@ should be create or updated to have the given content, and the expiry time should be set on that item. Reading from a channel is a bit more interesting. When a cache -lookup fail, or when it suceeds but finds an entry that may soon -expiry, a request is lodged for that cache item to be updated by +lookup fails, or when it succeeds but finds an entry that may soon +expire, a request is lodged for that cache item to be updated by user-space. These requests appear in the channel file. Successive reads will return successive requests. @@ -130,7 +163,7 @@ Thus a user-space helper is likely to: write a response loop. -If it dies and needs to be restarted, any requests that have not be +If it dies and needs to be restarted, any requests that have not been answered will still appear in the file and will be read by the new instance of the helper. @@ -142,10 +175,9 @@ Each cache should also define a "cache_request" method which takes a cache item and encodes a request into the buffer provided. - Note: If a cache has no active readers on the channel, and has had not active readers for more than 60 seconds, further requests will not be -added to the channel but instead all looks that do not find a valid +added to the channel but instead all lookups that do not find a valid entry will fail. This is partly for backward compatibility: The previous nfs exports table was deemed to be authoritative and a failed lookup meant a definite 'no'. @@ -154,18 +186,17 @@ request/response format ----------------------- While each cache is free to use it's own format for requests -and responses over channel, the following is recommended are +and responses over channel, the following is recommended as appropriate and support routines are available to help: Each request or response record should be printable ASCII with precisely one newline character which should be at the end. Fields within the record should be separated by spaces, normally one. If spaces, newlines, or nul characters are needed in a field they -much be quotes. two mechanisms are available: +much be quoted. two mechanisms are available: 1/ If a field begins '\x' then it must contain an even number of hex digits, and pairs of these digits provide the bytes in the field. 2/ otherwise a \ in the field must be followed by 3 octal digits which give the code for a byte. Other characters are treated - as them selves. At the very least, space, newlines nul, and + as them selves. At the very least, space, newline, nul, and '\' must be quoted in this way. - From 013d3868143387be99bb0373d49452eaa3c55d41 Mon Sep 17 00:00:00 2001 From: Martin Andersson Date: Mon, 27 Mar 2006 01:15:18 -0800 Subject: [PATCH 083/306] [PATCH] sched: fix task interactivity calculation Is a truncation error in kernel/sched.c triggered when the nice value is negative. The affected code is used in the TASK_INTERACTIVE macro. The code is: #define SCALE(v1,v1_max,v2_max) \ (v1) * (v2_max) / (v1_max) which is used in this way: SCALE(TASK_NICE(p), 40, MAX_BONUS) Comments in the code says: * This part scales the interactivity limit depending on niceness. * * We scale it linearly, offset by the INTERACTIVE_DELTA delta. * Here are a few examples of different nice levels: * * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] * * (the X axis represents the possible -5 ... 0 ... +5 dynamic * priority range a task can explore, a value of '1' means the * task is rated interactive.) However, the current code does not scale it linearly and the result differs from the given examples. If the mathematical function "floor" is used when the nice value is negative instead of the truncation one gets when using integer division, the result conforms to the documentation. Output of TASK_INTERACTIVE when using the kernel code: nice dynamic priorities -20 1 1 1 1 1 1 1 1 1 0 0 -19 1 1 1 1 1 1 1 1 0 0 0 -18 1 1 1 1 1 1 1 1 0 0 0 -17 1 1 1 1 1 1 1 1 0 0 0 -16 1 1 1 1 1 1 1 1 0 0 0 -15 1 1 1 1 1 1 1 0 0 0 0 -14 1 1 1 1 1 1 1 0 0 0 0 -13 1 1 1 1 1 1 1 0 0 0 0 -12 1 1 1 1 1 1 1 0 0 0 0 -11 1 1 1 1 1 1 0 0 0 0 0 -10 1 1 1 1 1 1 0 0 0 0 0 -9 1 1 1 1 1 1 0 0 0 0 0 -8 1 1 1 1 1 1 0 0 0 0 0 -7 1 1 1 1 1 0 0 0 0 0 0 -6 1 1 1 1 1 0 0 0 0 0 0 -5 1 1 1 1 1 0 0 0 0 0 0 -4 1 1 1 1 1 0 0 0 0 0 0 -3 1 1 1 1 0 0 0 0 0 0 0 -2 1 1 1 1 0 0 0 0 0 0 0 -1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0 3 1 1 1 1 0 0 0 0 0 0 0 4 1 1 1 0 0 0 0 0 0 0 0 5 1 1 1 0 0 0 0 0 0 0 0 6 1 1 1 0 0 0 0 0 0 0 0 7 1 1 1 0 0 0 0 0 0 0 0 8 1 1 0 0 0 0 0 0 0 0 0 9 1 1 0 0 0 0 0 0 0 0 0 10 1 1 0 0 0 0 0 0 0 0 0 11 1 1 0 0 0 0 0 0 0 0 0 12 1 0 0 0 0 0 0 0 0 0 0 13 1 0 0 0 0 0 0 0 0 0 0 14 1 0 0 0 0 0 0 0 0 0 0 15 1 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0 0 0 0 0 0 0 0 0 19 0 0 0 0 0 0 0 0 0 0 0 Output of TASK_INTERACTIVE when using "floor" nice dynamic priorities -20 1 1 1 1 1 1 1 1 1 0 0 -19 1 1 1 1 1 1 1 1 1 0 0 -18 1 1 1 1 1 1 1 1 1 0 0 -17 1 1 1 1 1 1 1 1 1 0 0 -16 1 1 1 1 1 1 1 1 0 0 0 -15 1 1 1 1 1 1 1 1 0 0 0 -14 1 1 1 1 1 1 1 1 0 0 0 -13 1 1 1 1 1 1 1 1 0 0 0 -12 1 1 1 1 1 1 1 0 0 0 0 -11 1 1 1 1 1 1 1 0 0 0 0 -10 1 1 1 1 1 1 1 0 0 0 0 -9 1 1 1 1 1 1 1 0 0 0 0 -8 1 1 1 1 1 1 0 0 0 0 0 -7 1 1 1 1 1 1 0 0 0 0 0 -6 1 1 1 1 1 1 0 0 0 0 0 -5 1 1 1 1 1 1 0 0 0 0 0 -4 1 1 1 1 1 0 0 0 0 0 0 -3 1 1 1 1 1 0 0 0 0 0 0 -2 1 1 1 1 1 0 0 0 0 0 0 -1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0 3 1 1 1 1 0 0 0 0 0 0 0 4 1 1 1 0 0 0 0 0 0 0 0 5 1 1 1 0 0 0 0 0 0 0 0 6 1 1 1 0 0 0 0 0 0 0 0 7 1 1 1 0 0 0 0 0 0 0 0 8 1 1 0 0 0 0 0 0 0 0 0 9 1 1 0 0 0 0 0 0 0 0 0 10 1 1 0 0 0 0 0 0 0 0 0 11 1 1 0 0 0 0 0 0 0 0 0 12 1 0 0 0 0 0 0 0 0 0 0 13 1 0 0 0 0 0 0 0 0 0 0 14 1 0 0 0 0 0 0 0 0 0 0 15 1 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0 0 0 0 0 0 0 0 0 19 0 0 0 0 0 0 0 0 0 0 0 Signed-off-by: Martin Andersson Acked-by: Ingo Molnar Cc: Nick Piggin Cc: Mike Galbraith Cc: Peter Williams Cc: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 78acdefeccca..dc599c85a88d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -145,7 +145,8 @@ (v1) * (v2_max) / (v1_max) #define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) From 77e4bfbcf071f795b54862455dce8902b3fc29c2 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 27 Mar 2006 01:15:20 -0800 Subject: [PATCH 084/306] [PATCH] Small schedule() optimization small schedule() microoptimization. Signed-off-by: Andreas Mohr Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index dc599c85a88d..a96a05d23262 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2879,13 +2879,11 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:22 -0800 Subject: [PATCH 085/306] [PATCH] sched: new sched domain for representing multi-core Add a new sched domain for representing multi-core with shared caches between cores. Consider a dual package system, each package containing two cores and with last level cache shared between cores with in a package. If there are two runnable processes, with this appended patch those two processes will be scheduled on different packages. On such systems, with this patch we have observed 8% perf improvement with specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2 users). This new domain will come into play only on multi-core systems with shared caches. On other systems, this sched domain will be removed by domain degeneration code. This new domain can be also used for implementing power savings policy (see OLS 2005 CMP kernel scheduler paper for more details.. I will post another patch for power savings policy soon) Most of the arch/* file changes are for cpu_coregroup_map() implementation. Signed-off-by: Suresh Siddha Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 9 ++++ arch/i386/kernel/cpu/common.c | 10 ++-- arch/i386/kernel/cpu/intel_cacheinfo.c | 22 +++++++- arch/i386/kernel/smpboot.c | 24 +++++++++ arch/x86_64/Kconfig | 9 ++++ arch/x86_64/kernel/setup.c | 3 +- arch/x86_64/kernel/smpboot.c | 24 +++++++++ include/asm-i386/processor.h | 5 ++ include/asm-i386/topology.h | 2 + include/asm-x86_64/processor.h | 4 ++ include/asm-x86_64/smp.h | 1 + include/asm-x86_64/topology.h | 2 + include/linux/topology.h | 9 ++++ kernel/sched.c | 73 ++++++++++++++++++++++++-- 14 files changed, 186 insertions(+), 11 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f7db71d0b913..f17bd1d2707e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -231,6 +231,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 7e3d6b6a4e96..a06a49075f10 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -266,7 +266,7 @@ static void __init early_cpu_detect(void) void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; - int junk; + int ebx; if (have_cpuid_p()) { /* Get vendor name */ @@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) /* Intel-defined flags: level 0x00000001 */ if ( c->cpuid_level >= 0x00000001 ) { u32 capability, excap; - cpuid(0x00000001, &tfms, &junk, &excap, &capability); + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; c->x86 = (tfms >> 8) & 15; @@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; +#ifdef CONFIG_SMP + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); +#else + c->apicid = (ebx >> 24) & 0xFF; +#endif } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index ce61921369e5..7e7fd4e67dd0 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_SMP + unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); +#endif if (c->cpuid_level > 3) { static int is_initialized; @@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) break; case 2: new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid >> index_msb; break; case 3: new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid >> index_msb; break; default: break; @@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l1i) l1i = new_l1i; - if (new_l2) + if (new_l2) { l2 = new_l2; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l2_id; +#endif + } - if (new_l3) + if (new_l3) { l3 = new_l3; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l3_id; +#endif + } if ( trace ) printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 82371d83bfa9..a6969903f2d6 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; /* Core ID of each logical CPU */ int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; +/* Last level cache ID of each logical CPU */ +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; + /* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); @@ -440,6 +443,18 @@ static void __devinit smp_callin(void) static int cpucount; +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 45efe0ca88f8..1cb4aa241c8c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -250,6 +250,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + source "kernel/Kconfig.preempt" config NUMA diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a57eec8311a7..d1f3e9272c05 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id(0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->apicid = phys_pkg_id(0); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 66e98659d077..ea48fa638070 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; /* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; + /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -445,6 +448,18 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index feca5d961e2b..af4bfd012475 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include /* flag for disabling the tsc */ extern int tsc_disable; @@ -67,6 +68,9 @@ struct cpuinfo_x86 { char pad0; int x86_power; unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif unsigned char x86_max_cores; /* cpuid returned max cores value */ unsigned char booted_cores; /* number of cores as seen by OS */ unsigned char apicid; @@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[]; extern int phys_proc_id[NR_CPUS]; extern int cpu_core_id[NR_CPUS]; +extern int cpu_llc_id[NR_CPUS]; extern char ignore_fpu_irq; extern void identify_cpu(struct cpuinfo_x86 *); diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index aa958c6ee83e..b94e5eeef917 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -112,4 +112,6 @@ extern unsigned long node_remap_size[]; #endif /* CONFIG_NUMA */ +extern cpumask_t cpu_coregroup_map(int cpu); + #endif /* _ASM_I386_TOPOLOGY_H */ diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 8c8d88c036ed..1aa2cee43344 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include #define TF_MASK 0x00000100 #define IF_MASK 0x00000200 @@ -65,6 +66,9 @@ struct cpuinfo_x86 { __u32 x86_power; __u32 extended_cpuid_level; /* Max extended CPUID function supported */ unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif __u8 apicid; __u8 booted_cores; /* number of cores as seen by OS */ } ____cacheline_aligned; diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 9ccbb2cfd5c0..a4fdaeb5c397 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; extern u8 phys_proc_id[NR_CPUS]; extern u8 cpu_core_id[NR_CPUS]; +extern u8 cpu_llc_id[NR_CPUS]; #define SMP_TRAMPOLINE_BASE 0x6000 diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index c642f5d9882d..9db54e9d17bb 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -68,4 +68,6 @@ extern int __node_distance(int, int); #include +extern cpumask_t cpu_coregroup_map(int cpu); + #endif diff --git a/include/linux/topology.h b/include/linux/topology.h index e8eb0040ce3a..a305ae2e44b6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -164,6 +164,15 @@ .nr_balance_failed = 0, \ } +#ifdef CONFIG_SCHED_MC +#ifndef SD_MC_INIT +/* for now its same as SD_CPU_INIT. + * TBD: Tune Domain parameters! + */ +#define SD_MC_INIT SD_CPU_INIT +#endif +#endif + #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! diff --git a/kernel/sched.c b/kernel/sched.c index a96a05d23262..8a8b71b5751b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu) } #endif +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct sched_domain, core_domains); +static struct sched_group sched_group_core[NR_CPUS]; +#endif + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int cpu_to_core_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} +#elif defined(CONFIG_SCHED_MC) +static int cpu_to_core_group(int cpu) +{ + return cpu; +} +#endif + static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_MC) + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) return first_cpu(cpu_sibling_map[cpu]); #else return cpu; @@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->parent = p; sd->groups = &sched_group_phys[group]; +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i); + group = cpu_to_core_group(i); + *sd = SD_MC_INIT; + sd->span = cpu_coregroup_map(i); + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_core[group]; +#endif + #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); @@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map) } #endif +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_core_map = cpu_coregroup_map(i); + cpus_and(this_core_map, this_core_map, *cpu_map); + if (i != first_cpu(this_core_map)) + continue; + init_sched_build_groups(sched_group_core, this_core_map, + &cpu_to_core_group); + } +#endif + + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map) power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i); + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + * SCHED_LOAD_SCALE / 10; + sd->groups->cpu_power = power; + sd = &per_cpu(phys_domains, i); + + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; +#else sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; +#endif #ifdef CONFIG_NUMA sd = &per_cpu(allnodes_domains, i); @@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map) next_sg: for_each_cpu_mask(j, sg->cpumask) { struct sched_domain *sd; - int power; sd = &per_cpu(phys_domains, j); if (j != first_cpu(sd->groups->cpumask)) { @@ -5833,10 +5896,8 @@ next_sg: */ continue; } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sg->cpu_power += power; + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; if (sg != sched_group_nodes[i]) @@ -5849,6 +5910,8 @@ next_sg: struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i); #else sd = &per_cpu(phys_domains, i); #endif From 0806903316d516a3b3851c51cea5c71724d9051d Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:23 -0800 Subject: [PATCH 086/306] [PATCH] sched: fix group power for allnodes_domains Current sched groups power calculation for allnodes_domains is wrong. We should really be using cumulative power of the physical packages in that group (similar to the calculation in node_domains) Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 62 +++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8a8b71b5751b..7854ee516b92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5621,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + + sg->cpu_power += sd->groups->cpu_power; + } + sg = sg->next; + if (sg != group_head) + goto next_sg; +} #endif /* @@ -5866,43 +5892,13 @@ void build_sched_domains(const cpumask_t *cpu_map) (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; #endif - -#ifdef CONFIG_NUMA - sd = &per_cpu(allnodes_domains, i); - if (sd->groups) { - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - } -#endif } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *sg = sched_group_nodes[i]; - int j; + for (i = 0; i < MAX_NUMNODES; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); - if (sg == NULL) - continue; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - if (sg != sched_group_nodes[i]) - goto next_sg; - } + init_numa_sched_groups_power(sched_group_allnodes); #endif /* Attach the domains */ From b06be912a3ad68c69dba0ed6e92723140020e392 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2006 01:15:24 -0800 Subject: [PATCH 087/306] [PATCH] x86: don't use cpuid.2 to determine cache info if cpuid.4 is supported Don't use cpuid.2 to determine cache info if cpuid.4 is supported. The exception is P4 trace cache. We always use cpuid.2 to get trace cache under P4. Signed-off-by: Shaohua Li Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/intel_cacheinfo.c | 85 +++++++++++++++----------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 7e7fd4e67dd0..9df87b03612c 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -225,11 +225,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - if (c->cpuid_level > 1) { + /* + * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for + * trace cache + */ + if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { /* supports eax=2 call */ int i, j, n; int regs[4]; unsigned char *dp = (unsigned char *)regs; + int only_trace = 0; + + if (num_cache_leaves != 0 && c->x86 == 15) + only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; @@ -251,6 +259,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { + if (only_trace && cache_table[k].cache_type != LVL_TRACE) + break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; @@ -276,43 +286,46 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - - if (new_l1d) - l1d = new_l1d; - - if (new_l1i) - l1i = new_l1i; - - if (new_l2) { - l2 = new_l2; -#ifdef CONFIG_SMP - cpu_llc_id[cpu] = l2_id; -#endif - } - - if (new_l3) { - l3 = new_l3; -#ifdef CONFIG_SMP - cpu_llc_id[cpu] = l3_id; -#endif - } - - if ( trace ) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); - if ( l1d ) - printk(", L1 D cache: %dK\n", l1d); - else - printk("\n"); - if ( l2 ) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - if ( l3 ) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); } + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) { + l2 = new_l2; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l2_id; +#endif + } + + if (new_l3) { + l3 = new_l3; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l3_id; +#endif + } + + if (trace) + printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if ( l1i ) + printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + + if (l1d) + printk(", L1 D cache: %dK\n", l1d); + else + printk("\n"); + + if (l2) + printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); + + if (l3) + printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); + + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + return l2; } From a117e66ed45ac0569c039ea60bd7a9a61e031858 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:25 -0800 Subject: [PATCH 088/306] [PATCH] unify pfn_to_page: generic functions There are 3 memory models, FLATMEM, DISCONTIGMEM, SPARSEMEM. Each arch has its own page_to_pfn(), pfn_to_page() for each models. But most of them can use the same arithmetic. This patch adds asm-generic/memory_model.h, which includes generic page_to_pfn(), pfn_to_page() definitions for each memory model. When CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y, out-of-line functions are used instead of macro. This is enabled by some archs and reduces text size. Signed-off-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/memory_model.h | 77 ++++++++++++++++++++++++++++++ include/asm-sparc64/page.h | 2 + include/linux/mmzone.h | 11 ----- mm/page_alloc.c | 42 ++++++++++++++++ 4 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 include/asm-generic/memory_model.h diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h new file mode 100644 index 000000000000..a7bb4978e808 --- /dev/null +++ b/include/asm-generic/memory_model.h @@ -0,0 +1,77 @@ +#ifndef __ASM_MEMORY_MODEL_H +#define __ASM_MEMORY_MODEL_H + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + +#if defined(CONFIG_FLATMEM) + +#ifndef ARCH_PFN_OFFSET +#define ARCH_PFN_OFFSET (0UL) +#endif + +#elif defined(CONFIG_DISCONTIGMEM) + +#ifndef arch_pfn_to_nid +#define arch_pfn_to_nid(pfn) pfn_to_nid(pfn) +#endif + +#ifndef arch_local_page_offset +#define arch_local_page_offset(pfn, nid) \ + ((pfn) - NODE_DATA(nid)->node_start_pfn) +#endif + +#endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +struct page; +/* this is useful when inlined pfn_to_page is too big */ +extern struct page *pfn_to_page(unsigned long pfn); +extern unsigned long page_to_pfn(struct page *page); +#else +/* + * supports 3 memory models. + */ +#if defined(CONFIG_FLATMEM) + +#define pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ + ARCH_PFN_OFFSET) +#elif defined(CONFIG_DISCONTIGMEM) + +#define pfn_to_page(pfn) \ +({ unsigned long __pfn = (pfn); \ + unsigned long __nid = arch_pfn_to_nid(pfn); \ + NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ +}) + +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + struct zone *__zone = page_zone(__pg); \ + (unsigned long)(__pg - __zone->zone_mem_map) + \ + __zone->zone_start_pfn; \ +}) + +#elif defined(CONFIG_SPARSEMEM) +/* + * Note: section's mem_map is encorded to reflect its start_pfn. + * section[i].section_mem_map == mem_map's address - start_pfn; + */ +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + int __sec = page_to_section(__pg); \ + __pg - __section_mem_map_addr(__nr_to_section(__sec)); \ +}) + +#define pfn_to_page(pfn) \ +({ unsigned long __pfn = (pfn); \ + struct mem_section *__sec = __pfn_to_section(__pfn); \ + __section_mem_map_addr(__sec) + __pfn; \ +}) +#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h index 66fe4ac59fd6..aabb21906724 100644 --- a/include/asm-sparc64/page.h +++ b/include/asm-sparc64/page.h @@ -111,6 +111,8 @@ typedef unsigned long pgprot_t; (_AC(0x0000000070000000,UL)) : \ (_AC(0xfffff80000000000,UL) + (1UL << 32UL))) +#include + #endif /* !(__ASSEMBLY__) */ /* to align the pointer to the (next) page boundary */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ebfc238cc243..0c1c0c0cce65 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -602,17 +602,6 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) return __nr_to_section(pfn_to_section_nr(pfn)); } -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ -}) -#define page_to_pfn(page) \ -({ \ - page - __section_mem_map_addr(__nr_to_section( \ - page_to_section(page))); \ -}) - static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 338a02bb004d..349b328763b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2745,3 +2745,45 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +/* + * pfn <-> page translation. out-of-line version. + * (see asm-generic/memory_model.h) + */ +#if defined(CONFIG_FLATMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return mem_map + (pfn - ARCH_PFN_OFFSET); +} +unsigned long page_to_pfn(struct page *page) +{ + return (page - mem_map) + ARCH_PFN_OFFSET; +} +#elif defined(CONFIG_DISCONTIGMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = arch_pfn_to_nid(pfn); + return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); +} +unsigned long page_to_pfn(struct page *page) +{ + struct zone *zone = page_zone(page); + return (page - zone->zone_mem_map) + zone->zone_start_pfn; + +} +#elif defined(CONFIG_SPARSEMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; +} + +unsigned long page_to_pfn(struct page *page) +{ + long section_id = page_to_section(page); + return page - __section_mem_map_addr(__nr_to_section(section_id)); +} +#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ From ad658b385e6308066f9084a7ea01305b223cd3a0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:33 -0800 Subject: [PATCH 089/306] [PATCH] unify pfn_to_page: i386 pfn_to_page i386 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/mmzone.h | 17 ----------------- include/asm-i386/page.h | 3 +-- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 74f595d80579..e33e9f9e4c66 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -70,8 +70,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) - /* * Following are macros that each numa implmentation must define. */ @@ -86,21 +84,6 @@ static inline int pfn_to_nid(unsigned long pfn) /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = pfn; \ - int __node = pfn_to_nid(__pfn); \ - &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ -}) - -#define page_to_pfn(pg) \ -({ \ - struct page *__page = pg; \ - struct zone *__zone = page_zone(__page); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ -}) - #ifdef CONFIG_X86_NUMAQ /* we have contiguous memory on NUMA-Q */ #define pfn_valid(pfn) ((pfn) < num_physpages) #else diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 997ca5d17876..30f52a2263ba 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -126,8 +126,6 @@ extern int page_is_ram(unsigned long pagenr); #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #ifdef CONFIG_FLATMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* CONFIG_FLATMEM */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) @@ -141,6 +139,7 @@ extern int page_is_ram(unsigned long pagenr); #endif /* __KERNEL__ */ +#include #include #endif /* _I386_PAGE_H */ From dc8ecb43701a78bd3c38e7fed1d1c76840579450 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:34 -0800 Subject: [PATCH 090/306] [PATCH] unify pfn_to_page: x86_64 pfn_to_page x86_64 can use generic funcs. For DISCONTIGMEM, CONFIG_OUT_OF_LINE_PFN_TO_PAGE is selected. Signed-off-by: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig | 4 ++++ arch/x86_64/mm/numa.c | 15 --------------- include/asm-x86_64/mmzone.h | 4 ---- include/asm-x86_64/page.h | 3 +-- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 1cb4aa241c8c..4310b4a311a5 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -334,6 +334,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y depends on NUMA +config OUT_OF_LINE_PFN_TO_PAGE + def_bool y + depends on DISCONTIGMEM + config NR_CPUS int "Maximum number of CPUs (2-256)" range 2 255 diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 63c72641b737..4be82d6e2b48 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -377,21 +377,6 @@ EXPORT_SYMBOL(node_data); * Should do that. */ -/* Requires pfn_valid(pfn) to be true */ -struct page *pfn_to_page(unsigned long pfn) -{ - int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); - return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; -} -EXPORT_SYMBOL(pfn_to_page); - -unsigned long page_to_pfn(struct page *page) -{ - return (long)(((page) - page_zone(page)->zone_mem_map) + - page_zone(page)->zone_start_pfn); -} -EXPORT_SYMBOL(page_to_pfn); - int pfn_valid(unsigned long pfn) { unsigned nid; diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index 937f99b26883..6b18cd8f293d 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -44,12 +44,8 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) #define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) -extern struct page *pfn_to_page(unsigned long pfn); -extern unsigned long page_to_pfn(struct page *page); extern int pfn_valid(unsigned long pfn); #endif -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) #endif #endif diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 615e3e494929..408185bac351 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -123,8 +123,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) #ifdef CONFIG_FLATMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < end_pfn) #endif @@ -140,6 +138,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __KERNEL__ */ +#include #include #endif /* _X86_64_PAGE_H */ From 659e35051b165fb40f1160190c2d23f3d5c319d5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:35 -0800 Subject: [PATCH 091/306] [PATCH] unify pfn_to_page: powerpc pfn_to_page PowerPC can use generic ones. Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-powerpc/page.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/asm-powerpc/page.h b/include/asm-powerpc/page.h index 0b82df483f7f..2fbecebe1c92 100644 --- a/include/asm-powerpc/page.h +++ b/include/asm-powerpc/page.h @@ -69,8 +69,6 @@ #endif #ifdef CONFIG_FLATMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif @@ -200,6 +198,7 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int page_is_ram(unsigned long pfn); +#include #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ From 1c05dda2b6f025267ab79a267e0a84628a3760e1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:36 -0800 Subject: [PATCH 092/306] [PATCH] unify pfn_to_page: alpha pfn_to_page Alpha can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Richard Henderson Cc: Ivan Kokshaysky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/mmzone.h | 16 +--------------- include/asm-alpha/page.h | 4 +--- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index a011ef4cf3d3..c9004398f273 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -59,9 +59,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define local_mapnr(kvaddr) \ - ((__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr))) - /* * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory * and returns the kaddr corresponding to first physical page in the @@ -104,19 +101,8 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) __xx; \ }) -#define pfn_to_page(pfn) \ -({ \ - unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ -}) - -#define page_to_pfn(page) \ - ((page) - page_zone(page)->zone_mem_map + \ - (page_zone(page)->zone_start_pfn)) - #define page_to_pa(page) \ - ((( (page) - page_zone(page)->zone_mem_map ) \ - + page_zone(page)->zone_start_pfn) << PAGE_SHIFT) + (page_to_pfn(page) << PAGE_SHIFT) #define pfn_to_nid(pfn) pa_to_nid(((u64)(pfn) << PAGE_SHIFT)) #define pfn_valid(pfn) \ diff --git a/include/asm-alpha/page.h b/include/asm-alpha/page.h index fa0b41b164a7..61bcf70b5eac 100644 --- a/include/asm-alpha/page.h +++ b/include/asm-alpha/page.h @@ -85,8 +85,6 @@ typedef unsigned long pgprot_t; #define __pa(x) ((unsigned long) (x) - PAGE_OFFSET) #define __va(x) ((void *)((unsigned long) (x) + PAGE_OFFSET)) #ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) ((pfn) < max_mapnr) @@ -95,9 +93,9 @@ typedef unsigned long pgprot_t; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - #endif /* __KERNEL__ */ +#include #include #endif /* _ALPHA_PAGE_H */ From 7eb98a2f3b605d8a11434d855b58d828393ea533 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:37 -0800 Subject: [PATCH 093/306] [PATCH] unify pfn_to_page: arm pfn_to_page ARM can use generic funcs. PFN_TO_NID, LOCAL_MAP_NR are defined by sub-archs. Signed-off-by: KAMEZAWA Hirotuki Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm/memory.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/include/asm-arm/memory.h b/include/asm-arm/memory.h index b4e1146ab682..afa5c3ea077c 100644 --- a/include/asm-arm/memory.h +++ b/include/asm-arm/memory.h @@ -172,9 +172,7 @@ static inline __deprecated void *bus_to_virt(unsigned long x) * virt_addr_valid(k) indicates whether a virtual address is valid */ #ifndef CONFIG_DISCONTIGMEM - -#define page_to_pfn(page) (((page) - mem_map) + PHYS_PFN_OFFSET) -#define pfn_to_page(pfn) ((mem_map + (pfn)) - PHYS_PFN_OFFSET) +#define ARCH_PFN_OFFSET (PHYS_PFN_OFFSET) #define pfn_valid(pfn) ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr)) #define virt_to_page(kaddr) (pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)) @@ -189,13 +187,8 @@ static inline __deprecated void *bus_to_virt(unsigned long x) * around in memory. */ #include - -#define page_to_pfn(page) \ - (( (page) - page_zone(page)->zone_mem_map) \ - + page_zone(page)->zone_start_pfn) - -#define pfn_to_page(pfn) \ - (PFN_TO_MAPBASE(pfn) + LOCAL_MAP_NR((pfn) << PAGE_SHIFT)) +#define arch_pfn_to_nid(pfn) (PFN_TO_NID(pfn)) +#define arch_local_page_offset(pfn, nid) (LOCAL_MAP_NR((pfn) << PAGE_OFFSET)) #define pfn_valid(pfn) \ ({ \ @@ -243,4 +236,6 @@ static inline __deprecated void *bus_to_virt(unsigned long x) #endif +#include + #endif From 89fccaf2c4a4a8318153e3460a92284dadeb8f71 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:38 -0800 Subject: [PATCH 094/306] [PATCH] unify pfn_to_page: arm26 pfn_to_page arm26 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Ian Molton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm26/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-arm26/memory.h b/include/asm-arm26/memory.h index 20d78616f650..a65f10b80dfb 100644 --- a/include/asm-arm26/memory.h +++ b/include/asm-arm26/memory.h @@ -81,8 +81,7 @@ static inline void *phys_to_virt(unsigned long x) * virt_to_page(k) convert a _valid_ virtual address to struct page * * virt_addr_valid(k) indicates whether a virtual address is valid */ -#define page_to_pfn(page) (((page) - mem_map) + PHYS_PFN_OFFSET) -#define pfn_to_page(pfn) ((mem_map + (pfn)) - PHYS_PFN_OFFSET) +#define ARCH_PFN_OFFSET (PHYS_PFN_OFFSET) #define pfn_valid(pfn) ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr)) #define virt_to_page(kaddr) (pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)) @@ -98,4 +97,5 @@ static inline void *phys_to_virt(unsigned long x) */ #define page_to_bus(page) (page_address(page)) +#include #endif From bb872f787a9b6a40a4aa206175e768137f595d9c Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:39 -0800 Subject: [PATCH 095/306] [PATCH] unify pfn_to_page: cris pfn_to_page cris can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Mikael Starvik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-cris/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-cris/page.h b/include/asm-cris/page.h index c99c478c482f..3787633e6209 100644 --- a/include/asm-cris/page.h +++ b/include/asm-cris/page.h @@ -43,8 +43,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* On CRIS the PFN numbers doesn't start at 0 so we have to compensate */ /* for that before indexing into the page table starting at mem_map */ -#define pfn_to_page(pfn) (mem_map + ((pfn) - (PAGE_OFFSET >> PAGE_SHIFT))) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + (PAGE_OFFSET >> PAGE_SHIFT)) +#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #define pfn_valid(pfn) (((pfn) - (PAGE_OFFSET >> PAGE_SHIFT)) < max_mapnr) /* to index into the page map. our pages all start at physical addr PAGE_OFFSET so @@ -77,6 +76,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __KERNEL__ */ +#include #include #endif /* _CRIS_PAGE_H */ From 5cdac7ca1b727184b1af21effaffcb9e2cd535c2 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:40 -0800 Subject: [PATCH 096/306] [PATCH] unify pfn_to_page: FRV pfn_to_page FRV can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-frv/page.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h index b8221b611b5c..dc0f7e08a4c2 100644 --- a/include/asm-frv/page.h +++ b/include/asm-frv/page.h @@ -57,13 +57,9 @@ extern unsigned long min_low_pfn; extern unsigned long max_pfn; #ifdef CONFIG_MMU -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long) ((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) - #else -#define pfn_to_page(pfn) (&mem_map[(pfn) - (PAGE_OFFSET >> PAGE_SHIFT)]) -#define page_to_pfn(page) ((PAGE_OFFSET >> PAGE_SHIFT) + (unsigned long) ((page) - mem_map)) +#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #define pfn_valid(pfn) ((pfn) >= min_low_pfn && (pfn) < max_low_pfn) #endif @@ -87,6 +83,7 @@ extern unsigned long max_pfn; #define WANT_PAGE_VIRTUAL 1 #endif +#include #include #endif /* _ASM_PAGE_H */ From dd6cc7631cf2a63b3d6c32619054e017479e72ca Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:41 -0800 Subject: [PATCH 097/306] [PATCH] unify pfn_to_page: h8300 pfn_to_page H8300 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-h8300/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-h8300/page.h b/include/asm-h8300/page.h index cd35b1cc6cde..6472c9f88227 100644 --- a/include/asm-h8300/page.h +++ b/include/asm-h8300/page.h @@ -71,8 +71,7 @@ extern unsigned long memory_end; #define page_to_virt(page) ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) #define pfn_valid(page) (page < max_mapnr) -#define pfn_to_page(pfn) virt_to_page(pfn_to_virt(pfn)) -#define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) +#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) @@ -81,6 +80,7 @@ extern unsigned long memory_end; #endif /* __KERNEL__ */ +#include #include #endif /* _H8300_PAGE_H */ From 7126cffe74f0dbcd015aaabab01069f422526535 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:42 -0800 Subject: [PATCH 098/306] [PATCH] unify pfn_to_page: m32r pfn_to_page m32r can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takata Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-m32r/mmzone.h | 14 -------------- include/asm-m32r/page.h | 5 ++--- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index adc7970a77ec..9f3b5accda88 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h @@ -21,20 +21,6 @@ extern struct pglist_data *node_data[]; __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = pfn; \ - int __node = pfn_to_nid(__pfn); \ - &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ -}) - -#define page_to_pfn(pg) \ -({ \ - struct page *__page = pg; \ - struct zone *__zone = page_zone(__page); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ -}) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) /* * pfn_valid should be made as fast as possible, and the current definition diff --git a/include/asm-m32r/page.h b/include/asm-m32r/page.h index 4ab578876361..9ddbc087dbc5 100644 --- a/include/asm-m32r/page.h +++ b/include/asm-m32r/page.h @@ -76,9 +76,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #ifndef CONFIG_DISCONTIGMEM #define PFN_BASE (CONFIG_MEMORY_START >> PAGE_SHIFT) -#define pfn_to_page(pfn) (mem_map + ((pfn) - PFN_BASE)) -#define page_to_pfn(page) \ - ((unsigned long)((page) - mem_map) + PFN_BASE) +#define ARCH_PFN_OFFSET PFN_BASE #define pfn_valid(pfn) (((pfn) - PFN_BASE) < max_mapnr) #endif /* !CONFIG_DISCONTIGMEM */ @@ -92,6 +90,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __KERNEL__ */ +#include #include #endif /* _ASM_M32R_PAGE_H */ From a02036e796e5046fe0463b9a092e9b617c525866 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:43 -0800 Subject: [PATCH 099/306] [PATCH] unify pfn_to_page: mips pfn_to_page MIPS can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-mips/mmzone.h | 14 -------------- include/asm-mips/page.h | 3 +-- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/include/asm-mips/mmzone.h b/include/asm-mips/mmzone.h index 011caebac369..7bde4432092b 100644 --- a/include/asm-mips/mmzone.h +++ b/include/asm-mips/mmzone.h @@ -22,20 +22,6 @@ NODE_DATA(__n)->node_spanned_pages) : 0);\ }) -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - pg_data_t *__pg = NODE_DATA(pfn_to_nid(__pfn)); \ - __pg->node_mem_map + (__pfn - __pg->node_start_pfn); \ -}) - -#define page_to_pfn(p) \ -({ \ - struct page *__p = (p); \ - struct zone *__z = page_zone(__p); \ - ((__p - __z->zone_mem_map) + __z->zone_start_pfn); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(addr) (0) diff --git a/include/asm-mips/page.h b/include/asm-mips/page.h index ee25a779bf49..a1eab136ff6c 100644 --- a/include/asm-mips/page.h +++ b/include/asm-mips/page.h @@ -140,8 +140,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #ifndef CONFIG_NEED_MULTIPLE_NODES -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif @@ -160,6 +158,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define WANT_PAGE_VIRTUAL #endif +#include #include #endif /* _ASM_PAGE_H */ From 0d833b41092df2f4a65cbdb0a0a947c35cdd2f9d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:43 -0800 Subject: [PATCH 100/306] [PATCH] unify pfn_to_page: parisc pfn_to_page PARISC can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-parisc/mmzone.h | 17 ----------------- include/asm-parisc/page.h | 3 +-- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index ae039f4fd711..ceb9b73199d1 100644 --- a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h @@ -25,23 +25,6 @@ extern struct node_map_data node_data[]; pg_data_t *__pgdat = NODE_DATA(nid); \ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) - -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - int __node = pfn_to_nid(__pfn); \ - &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ -}) - -#define page_to_pfn(pg) \ -({ \ - struct page *__page = pg; \ - struct zone *__zone = page_zone(__page); \ - BUG_ON(__zone == NULL); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ -}) /* We have these possible memory map layouts: * Astro: 0-3.75, 67.75-68, 4-64 diff --git a/include/asm-parisc/page.h b/include/asm-parisc/page.h index 4a6752b0afed..9f303c0c3cd7 100644 --- a/include/asm-parisc/page.h +++ b/include/asm-parisc/page.h @@ -130,8 +130,6 @@ extern int npmem_ranges; #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* CONFIG_DISCONTIGMEM */ @@ -152,6 +150,7 @@ extern int npmem_ranges; #endif /* __KERNEL__ */ +#include #include #endif /* _PARISC_PAGE_H */ From f68d4c996d3062c8fe64e3ed36d4c83d23288ad8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:44 -0800 Subject: [PATCH 101/306] [PATCH] unify pfn_to_page: ppc pfn_to_page PPC can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-ppc/page.h b/include/asm-ppc/page.h index 538e0c8ab243..a70ba2ee552d 100644 --- a/include/asm-ppc/page.h +++ b/include/asm-ppc/page.h @@ -149,8 +149,7 @@ extern int page_is_ram(unsigned long pfn); #define __pa(x) ___pa((unsigned long)(x)) #define __va(x) ((void *)(___va((unsigned long)(x)))) -#define pfn_to_page(pfn) (mem_map + ((pfn) - PPC_PGSTART)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + PPC_PGSTART) +#define ARCH_PFN_OFFSET (PPC_PGSTART) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) @@ -175,5 +174,6 @@ extern __inline__ int get_order(unsigned long size) /* We do define AT_SYSINFO_EHDR but don't use the gate mecanism */ #define __HAVE_ARCH_GATE_AREA 1 +#include #endif /* __KERNEL__ */ #endif /* _PPC_PAGE_H */ From aed630434c4fc0bca8ed62f6730d44ba00bdf595 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:45 -0800 Subject: [PATCH 102/306] [PATCH] unify pfn_to_page: s390 pfn_to_page s390 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-s390/page.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/asm-s390/page.h b/include/asm-s390/page.h index 2430c561e021..3b1138ac7e79 100644 --- a/include/asm-s390/page.h +++ b/include/asm-s390/page.h @@ -181,8 +181,6 @@ page_get_storage_key(unsigned long addr) #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(unsigned long)(x) -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) ((pfn) < max_mapnr) @@ -193,6 +191,7 @@ page_get_storage_key(unsigned long addr) #endif /* __KERNEL__ */ +#include #include #endif /* _S390_PAGE_H */ From 104b8deaa5c0144cccfc7d914413ff80c7176af1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:46 -0800 Subject: [PATCH 103/306] [PATCH] unify pfn_to_page: sh pfn_to_page sh can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Mundt Cc: Kazumoto Kojima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-sh/page.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h index 972c3f655b2a..9c89287c3e56 100644 --- a/include/asm-sh/page.h +++ b/include/asm-sh/page.h @@ -105,9 +105,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* PFN start number, because of __MEMORY_START */ #define PFN_START (__MEMORY_START >> PAGE_SHIFT) - -#define pfn_to_page(pfn) (mem_map + (pfn) - PFN_START) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + PFN_START) +#define ARCH_PFN_OFFSET (FPN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) (((pfn) - PFN_START) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) @@ -117,6 +115,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __KERNEL__ */ +#include #include #endif /* __ASM_SH_PAGE_H */ From 309d34bbe521940de9098d306a58a1797a42d990 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:47 -0800 Subject: [PATCH 104/306] [PATCH] unify pfn_to_page: sh64 pfn_to_page sh64 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Mundt Cc: Richard Curnow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-sh64/page.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/asm-sh64/page.h b/include/asm-sh64/page.h index c86df90f7cbd..e4937cdabebd 100644 --- a/include/asm-sh64/page.h +++ b/include/asm-sh64/page.h @@ -105,9 +105,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* PFN start number, because of __MEMORY_START */ #define PFN_START (__MEMORY_START >> PAGE_SHIFT) - -#define pfn_to_page(pfn) (mem_map + (pfn) - PFN_START) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + PFN_START) +#define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) (((pfn) - PFN_START) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) @@ -117,6 +115,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __KERNEL__ */ +#include #include #endif /* __ASM_SH64_PAGE_H */ From 064b2a0762ef7dca5f7068a90b260a309ffe2002 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:48 -0800 Subject: [PATCH 105/306] [PATCH] unify pfn_to_page: sparc pfn_to_page sparc can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-sparc/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-sparc/page.h b/include/asm-sparc/page.h index 9122684f6c1e..ec3274b7ddf4 100644 --- a/include/asm-sparc/page.h +++ b/include/asm-sparc/page.h @@ -152,8 +152,7 @@ extern unsigned long pfn_base; #define virt_to_phys __pa #define phys_to_virt __va -#define pfn_to_page(pfn) (mem_map + ((pfn)-(pfn_base))) -#define page_to_pfn(page) ((unsigned long)(((page) - mem_map) + pfn_base)) +#define ARCH_PFN_OFFSET (pfn_base) #define virt_to_page(kaddr) (mem_map + ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT))) #define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) @@ -164,6 +163,7 @@ extern unsigned long pfn_base; #endif /* __KERNEL__ */ +#include #include #endif /* _SPARC_PAGE_H */ From 9828c185db7ab41426c03c5539b1759ccf3c28ed Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:50 -0800 Subject: [PATCH 106/306] [PATCH] unify pfn_to_page: uml pfn_to_page UML can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-um/page.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/asm-um/page.h b/include/asm-um/page.h index 0229814af31e..41364330aff1 100644 --- a/include/asm-um/page.h +++ b/include/asm-um/page.h @@ -106,9 +106,6 @@ extern unsigned long uml_physmem; #define __pa(virt) to_phys((void *) (unsigned long) (virt)) #define __va(phys) to_virt((unsigned long) (phys)) -#define page_to_pfn(page) ((page) - mem_map) -#define pfn_to_page(pfn) (mem_map + (pfn)) - #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) #define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) @@ -121,6 +118,7 @@ extern struct page *arch_validate(struct page *page, gfp_t mask, int order); extern void arch_free_page(struct page *page, int order); #define HAVE_ARCH_FREE_PAGE +#include #include #endif From e6009f1ba5f6ff0fc8723254ce89ef9c979cc370 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:52 -0800 Subject: [PATCH 107/306] [PATCH] unify pfn_to_page: v850 pfn_to_page v850 can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Miles Bader Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-v850/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-v850/page.h b/include/asm-v850/page.h index b4bc85e7b91a..ad03c46a1f92 100644 --- a/include/asm-v850/page.h +++ b/include/asm-v850/page.h @@ -111,8 +111,7 @@ typedef unsigned long pgprot_t; #define page_to_virt(page) \ ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) -#define pfn_to_page(pfn) virt_to_page (pfn_to_virt (pfn)) -#define page_to_pfn(page) virt_to_pfn (page_to_virt (page)) +#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) \ @@ -125,6 +124,7 @@ typedef unsigned long pgprot_t; #endif /* KERNEL */ +#include #include #endif /* __V850_PAGE_H__ */ From 655a0443470a73d5dc36e974a241e8db59bb1ccb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:52 -0800 Subject: [PATCH 108/306] [PATCH] unify pfn_to_page: xtensa pfn_to_page xtensa can use generic funcs. Signed-off-by: KAMEZAWA Hiroyuki Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-xtensa/page.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/asm-xtensa/page.h b/include/asm-xtensa/page.h index 8ded36f255a2..992bac5c1258 100644 --- a/include/asm-xtensa/page.h +++ b/include/asm-xtensa/page.h @@ -109,10 +109,7 @@ void copy_user_page(void *to,void* from,unsigned long vaddr,struct page* page); #define __pa(x) ((unsigned long) (x) - PAGE_OFFSET) #define __va(x) ((void *)((unsigned long) (x) + PAGE_OFFSET)) #define pfn_valid(pfn) ((unsigned long)pfn < max_mapnr) -#ifndef CONFIG_DISCONTIGMEM -# define pfn_to_page(pfn) (mem_map + (pfn)) -# define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#else +#ifdef CONFIG_DISCONTIGMEM # error CONFIG_DISCONTIGMEM not supported #endif @@ -130,4 +127,5 @@ void copy_user_page(void *to,void* from,unsigned long vaddr,struct page* page); VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #endif /* __KERNEL__ */ +#include #endif /* _XTENSA_PAGE_H */ From 0ecd702bcb924d5fb7f687e09986f688336ac896 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:53 -0800 Subject: [PATCH 109/306] [PATCH] unify pfn_to_page: ia64 pfn_to_page ia64 has special config CONFIG_VIRTUAL_MEM_MAP. CONFIG_DISCONTIGMEM=y && CONFIG_VIRTUAL_MEM_MAP!=y is bug ? Signed-off-by: KAMEZAWA Hiroyuki Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/page.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 6e9aa23250c4..2087825eefa4 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -106,17 +106,25 @@ extern int ia64_pfn_valid (unsigned long pfn); # define ia64_pfn_valid(pfn) 1 #endif +#ifdef CONFIG_VIRTUAL_MEM_MAP +extern struct page *vmem_map; +#ifdef CONFIG_DISCONTIGMEM +# define page_to_pfn(page) ((unsigned long) (page - vmem_map)) +# define pfn_to_page(pfn) (vmem_map + (pfn)) +#endif +#endif + +#if defined(CONFIG_FLATMEM) || defined(CONFIG_SPARSEMEM) +/* FLATMEM always configures mem_map (mem_map = vmem_map if necessary) */ +#include +#endif + #ifdef CONFIG_FLATMEM # define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) -# define page_to_pfn(page) ((unsigned long) (page - mem_map)) -# define pfn_to_page(pfn) (mem_map + (pfn)) #elif defined(CONFIG_DISCONTIGMEM) -extern struct page *vmem_map; extern unsigned long min_low_pfn; extern unsigned long max_low_pfn; # define pfn_valid(pfn) (((pfn) >= min_low_pfn) && ((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) -# define page_to_pfn(page) ((unsigned long) (page - vmem_map)) -# define pfn_to_page(pfn) (vmem_map + (pfn)) #endif #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) From a0140c1d85637ee5f4ea7c78f066e3611a6a79dc Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:55 -0800 Subject: [PATCH 110/306] [PATCH] remove zone_mem_map This patch removes zone_mem_map. pfn_to_page uses pgdat, page_to_pfn uses zone. page_to_pfn can use pgdat instead of zone, which is only one user of zone_mem_map. By modifing it, we can remove zone_mem_map. Signed-off-by: KAMEZAWA Hiroyuki Cc: Dave Hansen Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/mmzone.h | 3 +-- include/asm-generic/memory_model.h | 10 +++++----- include/linux/mmzone.h | 1 - mm/page_alloc.c | 6 ++---- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index c9004398f273..192d80c875b0 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -83,8 +83,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) pte_t pte; \ unsigned long pfn; \ \ - pfn = ((unsigned long)((page)-page_zone(page)->zone_mem_map)) << 32; \ - pfn += page_zone(page)->zone_start_pfn << 32; \ + pfn = page_to_pfn(page) << 32; \ pte_val(pte) = pfn | pgprot_val(pgprot); \ \ pte; \ diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a7bb4978e808..0cfb086dd373 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -45,11 +45,11 @@ extern unsigned long page_to_pfn(struct page *page); NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ }) -#define page_to_pfn(pg) \ -({ struct page *__pg = (pg); \ - struct zone *__zone = page_zone(__pg); \ - (unsigned long)(__pg - __zone->zone_mem_map) + \ - __zone->zone_start_pfn; \ +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg)); \ + (unsigned long)(__pg - __pgdat->node_mem_map) + \ + __pgdat->node_start_pfn; \ }) #elif defined(CONFIG_SPARSEMEM) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c1c0c0cce65..ace31c515a8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -225,7 +225,6 @@ struct zone { * Discontig memory support fields. */ struct pglist_data *zone_pgdat; - struct page *zone_mem_map; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 349b328763b7..8dc8f2735d22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2042,7 +2042,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone, zone_wait_table_init(zone, size); pgdat->nr_zones = zone_idx(zone) + 1; - zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); @@ -2768,9 +2767,8 @@ struct page *pfn_to_page(unsigned long pfn) } unsigned long page_to_pfn(struct page *page) { - struct zone *zone = page_zone(page); - return (page - zone->zone_mem_map) + zone->zone_start_pfn; - + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; } #elif defined(CONFIG_SPARSEMEM) struct page *pfn_to_page(unsigned long pfn) From 8357f8695d58b50fbf2bd507b4b0fc2cd1e43bd6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:57 -0800 Subject: [PATCH 111/306] [PATCH] define for_each_online_pgdat This patch defines for_each_online_pgdat() as a replacement of for_each_pgdat() Now, online nodes are managed by node_online_map. But for_each_pgdat() uses pgdat_link to iterate over all nodes(pgdat). This means management structure for online pgdat is duplicated. I think using node_online_map for for_each_pgdat() is simple and sane rather ather than pgdat_link. New macro is named as for_each_online_pgdat(). Following patch will fix callers of for_each_pgdat(). The bootmem allocater uses for_each_pgdat() before pgdat initialization. I don't think it's sane. Following patch will fix it. Signed-off-by: Yasunori Goto Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 108 +++++++++++++++++++++------------------ include/linux/nodemask.h | 4 ++ 2 files changed, 61 insertions(+), 51 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ace31c515a8c..96eb08025092 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -13,6 +13,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -349,57 +350,6 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) -/** - * for_each_pgdat - helper macro to iterate over all nodes - * @pgdat - pointer to a pg_data_t variable - * - * Meant to help with common loops of the form - * pgdat = pgdat_list; - * while(pgdat) { - * ... - * pgdat = pgdat->pgdat_next; - * } - */ -#define for_each_pgdat(pgdat) \ - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else if (pgdat->pgdat_next) { - pgdat = pgdat->pgdat_next; - zone = pgdat->node_zones; - } else - zone = NULL; - - return zone; -} - -/** - * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable - * - * The user only needs to declare the zone variable, for_each_zone - * fills it in. This basically means for_each_zone() is an - * easier to read version of this piece of code: - * - * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - * for (i = 0; i < MAX_NR_ZONES; ++i) { - * struct zone * z = pgdat->node_zones + i; - * ... - * } - * } - */ -#define for_each_zone(zone) \ - for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) - static inline int populated_zone(struct zone *zone) { return (!!zone->present_pages); @@ -471,6 +421,62 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +static inline struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} + + +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat - pointer to a pg_data_t variable + */ +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ + pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. + */ +#define for_each_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) + #ifdef CONFIG_SPARSEMEM #include #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index b959a4525cbd..1a9ef3e627d1 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -350,11 +350,15 @@ extern nodemask_t node_possible_map; #define num_possible_nodes() nodes_weight(node_possible_map) #define node_online(node) node_isset((node), node_online_map) #define node_possible(node) node_isset((node), node_possible_map) +#define first_online_node first_node(node_online_map) +#define next_online_node(nid) next_node((nid), node_online_map) #else #define num_online_nodes() 1 #define num_possible_nodes() 1 #define node_online(node) ((node) == 0) #define node_possible(node) ((node) == 0) +#define first_online_node 0 +#define next_online_node(nid) (MAX_NUMNODES) #endif #define any_online_node(mask) \ From 679bc9fbb508a0aac9539b2de747eb5849feb428 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:58 -0800 Subject: [PATCH 112/306] [PATCH] for_each_online_pgdat: for_each_bootmem Add a list_head to bootmem_data_t and make bootmems use it. bootmem list is sorted by node_boot_start. Only nodes against which init_bootmem() is called are linked to the list. (i386 allocates bootmem only from one node(0) not from all online nodes.) A summary: 1. for_each_online_pgdat() traverses all *online* nodes. 2. alloc_bootmem() allocates memory only from initialized-for-bootmem nodes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 + mm/bootmem.c | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 7155452fb4a8..de3eb8d8ae26 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -38,6 +38,7 @@ typedef struct bootmem_data { unsigned long last_pos; unsigned long last_success; /* Previous allocation point. To speed * up searching */ + struct list_head list; } bootmem_data_t; extern unsigned long __init bootmem_bootmap_pages (unsigned long); diff --git a/mm/bootmem.c b/mm/bootmem.c index b55bd39fc5dd..d3e3bd2ffcea 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so * dma_get_required_mask(), which uses * it, can be an inline function */ +static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) return mapsize; } +/* + * link bdata in order + */ +static void link_bootmem(bootmem_data_t *bdata) +{ + bootmem_data_t *ent; + if (list_empty(&bdata_list)) { + list_add(&bdata->list, &bdata_list); + return; + } + /* insert in order */ + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_boot_start < ent->node_boot_start) { + list_add_tail(&bdata->list, &ent->list); + return; + } + } + list_add_tail(&bdata->list, &bdata_list); + return; +} + /* * Called once to set up the allocator itself. @@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; + link_bootmem(bdata); /* * Initially all pages are reserved - setup_arch() has to @@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void) void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, 0))) + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); /* @@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT))) return(ptr); From ec936fc563715a9e2b2e363eb060655b49529325 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:59 -0800 Subject: [PATCH 113/306] [PATCH] for_each_online_pgdat: renaming for_each_pgdat Replace for_each_pgdat() with for_each_online_pgdat(). Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 2 +- arch/ia64/mm/discontig.c | 4 ++-- arch/ia64/mm/init.c | 2 +- arch/m32r/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/x86_64/mm/init.c | 2 +- fs/buffer.c | 2 +- mm/page_alloc.c | 6 +++--- mm/vmscan.c | 6 +++--- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 9db3242103be..2889567e21a1 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -36,7 +36,7 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 2f5e44862e91..384f1d7dce96 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -386,7 +386,7 @@ static void __init pgdat_insert(pg_data_t *pgdat) { pg_data_t *prev = NULL, *next; - for_each_pgdat(next) + for_each_online_pgdat(next) if (pgdat->node_id < next->node_id) break; else @@ -560,7 +560,7 @@ void show_mem(void) printk("Mem-info:\n"); show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long present; unsigned long flags; int shared = 0, cached = 0, reserved = 0; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ff4f31fcd330..2ef1151cde90 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -600,7 +600,7 @@ mem_init (void) kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); kclist_add(&kcore_kernel, _stext, _end - _stext); - for_each_pgdat(pgdat) + for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) totalram_pages += free_all_bootmem_node(pgdat); diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index c9e7dad860b7..2e0fe199ce38 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -47,7 +47,7 @@ void show_mem(void) printk("Mem-info:\n"); show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long flags; pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index badac10d700c..5e435a9c3431 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -195,7 +195,7 @@ void show_mem(void) printk("Mem-info:\n"); show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long flags; pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { @@ -351,7 +351,7 @@ void __init mem_init(void) max_mapnr = max_pfn; totalram_pages += free_all_bootmem(); #endif - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { if (!pfn_valid(pgdat->node_start_pfn + i)) continue; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b04415625442..e5f7f1c34462 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -72,7 +72,7 @@ void show_mem(void) show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pfn_to_page(pgdat->node_start_pfn + i); total++; diff --git a/fs/buffer.c b/fs/buffer.c index d597758dd129..23f1f3a68077 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -493,7 +493,7 @@ static void free_more_memory(void) wakeup_pdflush(1024); yield(); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; if (*zones) try_to_free_pages(zones, GFP_NOFS); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8dc8f2735d22..ccc3713dd407 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1201,7 +1201,7 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_pgdat(pgdat) + for_each_online_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; return pages; @@ -1343,7 +1343,7 @@ void get_zone_counts(unsigned long *active, *active = 0; *inactive = 0; *free = 0; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long l, m, n; __get_zone_counts(&l, &m, &n, pgdat); *active += l; @@ -2482,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void) struct pglist_data *pgdat; int j, idx; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; diff --git a/mm/vmscan.c b/mm/vmscan.c index 78865c849f8f..acdf001d6941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1305,7 +1305,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; repeat: - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long freed; freed = balance_pgdat(pgdat, nr_to_free, 0); @@ -1335,7 +1335,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, cpumask_t mask; if (action == CPU_ONLINE) { - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) /* One of our CPUs online: restore mask */ @@ -1351,7 +1351,7 @@ static int __init kswapd_init(void) pg_data_t *pgdat; swap_setup(); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { pid_t pid; pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); From 3571761fe49d960bb720c2308ffb9401f0a5e161 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:00 -0800 Subject: [PATCH 114/306] [PATCH] for_each_online_pgdat: remove sorting pgdat Because pgdat_list was linked to pgdat_list in *reverse* order, (By default) some of arch has to sort it by themselves. for_each_pgdat has gone..for_each_online_pgdat() uses node_online_map, which doesn't need to be sorted. This patch removes codes for sorting pgdat. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/discontig.c | 11 ----------- arch/ia64/mm/discontig.c | 31 ------------------------------- arch/m32r/mm/discontig.c | 6 ------ 3 files changed, 48 deletions(-) diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index c4af9638dbfa..c3f3ae95e22d 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -352,17 +352,6 @@ void __init zone_sizes_init(void) { int nid; - /* - * Insert nodes into pgdat_list backward so they appear in order. - * Clobber node 0's links and NULL out pgdat_list before starting. - */ - pgdat_list = NULL; - for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) { - if (!node_online(nid)) - continue; - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } for_each_online_node(nid) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 384f1d7dce96..ec9eeb89975d 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -378,31 +378,6 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) return ptr; } -/** - * pgdat_insert - insert the pgdat into global pgdat_list - * @pgdat: the pgdat for a node. - */ -static void __init pgdat_insert(pg_data_t *pgdat) -{ - pg_data_t *prev = NULL, *next; - - for_each_online_pgdat(next) - if (pgdat->node_id < next->node_id) - break; - else - prev = next; - - if (prev) { - prev->pgdat_next = pgdat; - pgdat->pgdat_next = next; - } else { - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - } - - return; -} - /** * memory_less_nodes - allocate and initialize CPU only nodes pernode * information. @@ -745,11 +720,5 @@ void __init paging_init(void) pfn_offset, zholes_size); } - /* - * Make memory less nodes become a member of the known nodes. - */ - for_each_node_mask(node, memory_less_mask) - pgdat_insert(mem_data[node].pgdat); - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index 08e727955555..70c8528a0ad5 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -137,12 +137,6 @@ unsigned long __init zone_sizes_init(void) int nid, i; mem_prof_t *mp; - pgdat_list = NULL; - for (nid = num_online_nodes() - 1 ; nid >= 0 ; nid--) { - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - for_each_online_node(nid) { mp = &mem_prof[nid]; for (i = 0 ; i < MAX_NR_ZONES ; i++) { From ae0f15fb91274e67d78836d38c99ec363df33073 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:01 -0800 Subject: [PATCH 115/306] [PATCH] for_each_online_pgdat: remove pgdat_list By using for_each_online_pgdat(), pgdat_list is not necessary now. This patch removes it. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 --- mm/page_alloc.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 96eb08025092..0d12c3cf1f86 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -307,7 +307,6 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; @@ -324,8 +323,6 @@ typedef struct pglist_data { #include -extern struct pglist_data *pgdat_list; - void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccc3713dd407..dc523a1f270d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,7 +49,6 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; @@ -2169,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos) { pg_data_t *pgdat; loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) --node; return pgdat; @@ -2181,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) From 95144c788dc01b6a0ff2c9c2222e37ffdab358b8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:02 -0800 Subject: [PATCH 116/306] [PATCH] uninline zone helpers Helper functions for for_each_online_pgdat/for_each_zone look too big to be inlined. Speed of these helper macro itself is not very important. (inner loops are tend to do more work than this) This patch make helper function to be out-of-lined. inline out-of-line .text 005c0680 005bf6a0 005c0680 - 005bf6a0 = FE0 = 4Kbytes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 38 +++----------------------------- mm/Makefile | 2 +- mm/mmzone.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 36 deletions(-) create mode 100644 mm/mmzone.c diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0d12c3cf1f86..b5c21122c299 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -418,20 +418,9 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static inline struct pglist_data *first_online_pgdat(void) -{ - return NODE_DATA(first_online_node); -} - -static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) -{ - int nid = next_online_node(pgdat->node_id); - - if (nid == MAX_NUMNODES) - return NULL; - return NODE_DATA(nid); -} - +extern struct pglist_data *first_online_pgdat(void); +extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); +extern struct zone *next_zone(struct zone *zone); /** * for_each_pgdat - helper macro to iterate over all nodes @@ -441,27 +430,6 @@ static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else { - pgdat = next_online_pgdat(pgdat); - if (pgdat) - zone = pgdat->node_zones; - else - zone = NULL; - } - return zone; -} - /** * for_each_zone - helper macro to iterate over all memory zones * @zone - pointer to struct zone variable diff --git a/mm/Makefile b/mm/Makefile index f10c753dce6d..0b8f73f2ed16 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o $(mmu-y) + prio_tree.o util.o mmzone.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 000000000000..b022370e612e --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,50 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats and zones. + */ + + +#include +#include +#include +#include + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +EXPORT_SYMBOL(first_online_pgdat); + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} +EXPORT_SYMBOL(next_online_pgdat); + + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} +EXPORT_SYMBOL(next_zone); + From 22a9835c350782a5c3257343713932af3ac92ee0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 27 Mar 2006 01:16:04 -0800 Subject: [PATCH 117/306] [PATCH] unify PFN_* macros Just about every architecture defines some macros to do operations on pfns. They're all virtually identical. This patch consolidates all of them. One minor glitch is that at least i386 uses them in a very skeletal header file. To keep away from #include dependency hell, I stuck the new definitions in a new, isolated header. Of all of the implementations, sh64 is the only one that varied by a bit. It used some masks to ensure that any sign-extension got ripped away before the arithmetic is done. This has been posted to that sh64 maintainers and the development list. Compiles on x86, x86_64, ia64 and ppc64. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/setup.c | 9 +-------- arch/alpha/mm/numa.c | 4 +--- arch/arm26/mm/init.c | 7 +------ arch/cris/kernel/setup.c | 5 +---- arch/i386/kernel/setup.c | 1 + arch/i386/mm/discontig.c | 1 + arch/m32r/kernel/setup.c | 1 + arch/m32r/mm/discontig.c | 1 + arch/m32r/mm/init.c | 1 + arch/mips/ite-boards/ivr/init.c | 3 --- arch/mips/ite-boards/qed-4n-s01b/init.c | 3 --- arch/mips/kernel/setup.c | 9 +-------- arch/mips/mips-boards/generic/memory.c | 7 ++----- arch/mips/mips-boards/sim/sim_mem.c | 7 ++----- arch/mips/mm/init.c | 4 +--- arch/mips/sgi-ip27/ip27-memory.c | 3 +-- arch/sh/kernel/setup.c | 5 +---- arch/sh64/kernel/setup.c | 1 + arch/um/kernel/physmem.c | 3 +-- include/asm-i386/setup.h | 4 +--- include/asm-m32r/setup.h | 4 ---- include/asm-sh64/platform.h | 5 ----- include/linux/pfn.h | 9 +++++++++ 23 files changed, 29 insertions(+), 68 deletions(-) create mode 100644 include/linux/pfn.h diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index b4e5f8ff2b25..9402624453c2 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -34,6 +34,7 @@ #include #include #include +#include #ifdef CONFIG_MAGIC_SYSRQ #include #include @@ -241,9 +242,6 @@ reserve_std_resources(void) request_resource(io, standard_io_resources+i); } -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) #define PFN_MAX PFN_DOWN(0x80000000) #define for_each_mem_cluster(memdesc, cluster, i) \ for ((cluster) = (memdesc)->cluster, (i) = 0; \ @@ -472,11 +470,6 @@ page_is_ram(unsigned long pfn) return 0; } -#undef PFN_UP -#undef PFN_DOWN -#undef PFN_PHYS -#undef PFN_MAX - void __init setup_arch(char **cmdline_p) { diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 6d5251254f68..bf6b65c81bef 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -27,9 +28,6 @@ bootmem_data_t node_bdata[MAX_NUMNODES]; #define DBGDCONT(args...) #endif -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) #define for_each_mem_cluster(memdesc, cluster, i) \ for ((cluster) = (memdesc)->cluster, (i) = 0; \ (i) < (memdesc)->numclusters; (i)++, (cluster)++) diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index e3ecaa453747..7da8a5205678 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -101,12 +102,6 @@ struct node_info { int bootmap_pages; }; -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_UP(x) (PAGE_ALIGN(x) >> PAGE_SHIFT) -#define PFN_SIZE(x) ((x) >> PAGE_SHIFT) -#define PFN_RANGE(s,e) PFN_SIZE(PAGE_ALIGN((unsigned long)(e)) - \ - (((unsigned long)(s)) & PAGE_MASK)) - /* * FIXME: We really want to avoid allocating the bootmap bitmap * over the top of the initrd. Hopefully, this is located towards diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 1ba57efff60d..619a6eefd893 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -88,10 +89,6 @@ setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) - /* min_low_pfn points to the start of DRAM, start_pfn points * to the first DRAM pages after the kernel, and max_low_pfn * to the end of DRAM. diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 6917daa159ab..8c08660b4e5d 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -46,6 +46,7 @@ #include #include #include +#include #include