Sophie

Sophie

distrib > Mandriva > current > x86_64 > by-pkgid > d41cc2729f5f0fbbd2607b14a49457f3 > files > 62

kernel-xen-2.6.32.11-2mdv2010.1.src.rpm

From: jbeulich@novell.com
Subject: reduce contention on xtime_lock
Patch-mainline: n/a
References: bnc#569014, bnc#571041, bnc#571769, bnc#572146

Especially on large systems the number of CPUs queueing up on
xtime_lock may become significant, and (as reported in the bugs above)
may even prevent proper operation of the system when Xen is using
deep C-states. There is, however, no need for all CPUs in the system
to update global time - it is sufficient to have a single (at any given
point in time) CPU being responsible for this.

Also, while touching that code, avoid calling printk() with xtime_lock
held.

--- sle11sp1-2010-02-17.orig/arch/x86/kernel/time-xen.c	2010-02-18 17:30:18.000000000 +0100
+++ sle11sp1-2010-02-17/arch/x86/kernel/time-xen.c	2010-02-18 17:33:07.000000000 +0100
@@ -58,6 +58,7 @@ static u32 shadow_tv_version;
 /* Keep track of last time we did processing/updating of jiffies and xtime. */
 static u64 processed_system_time;   /* System time (ns) at last processing. */
 static DEFINE_PER_CPU(u64, processed_system_time);
+static DEFINE_PER_CPU(u64, accounted_system_time);
 
 /* How much CPU time was spent blocked and how much was 'stolen'? */
 static DEFINE_PER_CPU(u64, processed_stolen_time);
@@ -123,6 +124,19 @@ static int __init __permitted_clock_jitt
 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
 
 /*
+ * Limit on the number of CPUs that may concurrently attempt to acquire
+ * xtime_lock in timer_interrupt() (reducing contention potentially leading
+ * to a live lock on systems with many CPUs).
+ */
+static unsigned int __read_mostly duty_limit = -2;
+static int __init set_duty_limit(char *str)
+{
+	duty_limit = simple_strtoul(str, NULL, 0) - 1;
+	return 1;
+}
+__setup("timer_duty_limit=", set_duty_limit);
+
+/*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
  */
@@ -422,9 +436,11 @@ EXPORT_SYMBOL(profile_pc);
  */
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
+	static unsigned int contention_count;
 	s64 delta, delta_cpu, stolen, blocked;
 	unsigned int i, cpu = smp_processor_id();
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+	bool duty = false;
 	struct vcpu_runstate_info runstate;
 
 	/* Keep nmi watchdog up to date */
@@ -437,7 +453,13 @@ static irqreturn_t timer_interrupt(int i
 	 * the irq version of write_lock because as just said we have irq
 	 * locally disabled. -arca
 	 */
-	write_seqlock(&xtime_lock);
+	asm (LOCK_PREFIX "xaddl %1, %0"
+	     : "+m" (contention_count), "=r" (i) : "1" (1));
+	if (i <= duty_limit) {
+		write_seqlock(&xtime_lock);
+		duty = true;
+	}
+	asm (LOCK_PREFIX "decl %0" : "+m" (contention_count));
 
 	do {
 		get_time_values_from_xen(cpu);
@@ -451,40 +473,63 @@ static irqreturn_t timer_interrupt(int i
 		get_runstate_snapshot(&runstate);
 	} while (!time_values_up_to_date());
 
-	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
-	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
-	    && printk_ratelimit()) {
-		printk("Timer ISR/%u: Time went backwards: "
-		       "delta=%lld delta_cpu=%lld shadow=%lld "
-		       "off=%lld processed=%lld cpu_processed=%lld\n",
-		       cpu, delta, delta_cpu, shadow->system_timestamp,
-		       (s64)get_nsec_offset(shadow),
-		       processed_system_time,
-		       per_cpu(processed_system_time, cpu));
-		for (i = 0; i < num_online_cpus(); i++)
-			printk(" %d: %lld\n", i,
-			       per_cpu(processed_system_time, i));
-	}
+	if (duty && unlikely(delta < -(s64)permitted_clock_jitter)) {
+		blocked = processed_system_time;
+		write_sequnlock(&xtime_lock);
+		if (printk_ratelimit()) {
+			printk("Timer ISR/%u: Time went backwards: "
+			       "delta=%Ld/%Ld shadow=%Lx off=%Lx "
+			       "processed=%Lx/%Lx\n",
+			       cpu, delta, delta_cpu, shadow->system_timestamp,
+			       get_nsec_offset(shadow), blocked,
+			       per_cpu(processed_system_time, cpu));
+			for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
+				printk(" %u: %Lx\n", i,
+				       per_cpu(processed_system_time, i));
+		}
+	} else if (unlikely(delta_cpu < -(s64)permitted_clock_jitter)) {
+		blocked = processed_system_time;
+		if (duty)
+			write_sequnlock(&xtime_lock);
+		if (printk_ratelimit()) {
+			printk("Timer ISR/%u: Time went backwards: delta=%Ld"
+			       " shadow=%Lx off=%Lx processed=%Lx/%Lx\n",
+			       cpu, delta_cpu, shadow->system_timestamp,
+			       get_nsec_offset(shadow), blocked,
+			       per_cpu(processed_system_time, cpu));
+			for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
+				printk(" %u: %Lx\n", i,
+				       per_cpu(processed_system_time, i));
+		}
+	} else if (duty) {
+		/* System-wide jiffy work. */
+		if (delta >= NS_PER_TICK) {
+			do_div(delta, NS_PER_TICK);
+			processed_system_time += delta * NS_PER_TICK;
+			while (delta > HZ) {
+				clobber_induction_variable(delta);
+				do_timer(HZ);
+				delta -= HZ;
+			}
+			do_timer(delta);
+		}
 
-	/* System-wide jiffy work. */
-	if (delta >= NS_PER_TICK) {
-		do_div(delta, NS_PER_TICK);
-		processed_system_time += delta * NS_PER_TICK;
-		while (delta > HZ) {
-			clobber_induction_variable(delta);
-			do_timer(HZ);
-			delta -= HZ;
+		if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+			update_wallclock();
+			if (keventd_up())
+				schedule_work(&clock_was_set_work);
 		}
-		do_timer(delta);
-	}
 
-	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
-		update_wallclock();
-		if (keventd_up())
-			schedule_work(&clock_was_set_work);
+		write_sequnlock(&xtime_lock);
 	}
 
-	write_sequnlock(&xtime_lock);
+	delta = delta_cpu;
+	delta_cpu += per_cpu(processed_system_time, cpu)
+		   - per_cpu(accounted_system_time, cpu);
+	if (delta >= NS_PER_TICK) {
+		do_div(delta, NS_PER_TICK);
+		per_cpu(processed_system_time, cpu) += delta * NS_PER_TICK;
+	}
 
 	/*
 	 * Account stolen ticks.
@@ -499,7 +544,7 @@ static irqreturn_t timer_interrupt(int i
 			stolen += delta_cpu; /* clamp local-time progress */
 		do_div(stolen, NS_PER_TICK);
 		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+		per_cpu(accounted_system_time, cpu) += stolen * NS_PER_TICK;
 		account_steal_ticks(stolen);
 	}
 
@@ -515,14 +560,14 @@ static irqreturn_t timer_interrupt(int i
 			blocked += delta_cpu; /* clamp local-time progress */
 		do_div(blocked, NS_PER_TICK);
 		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
+		per_cpu(accounted_system_time, cpu)  += blocked * NS_PER_TICK;
 		account_idle_ticks(blocked);
 	}
 
 	/* Account user/system ticks. */
 	if (delta_cpu > 0) {
 		do_div(delta_cpu, NS_PER_TICK);
-		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+		per_cpu(accounted_system_time, cpu) += delta_cpu * NS_PER_TICK;
 		if (user_mode_vm(get_irq_regs()))
 			account_user_time(current, (cputime_t)delta_cpu,
 					  (cputime_t)delta_cpu);
@@ -623,6 +668,7 @@ static void xen_clocksource_resume(void)
 			BUG();
 		}
 		get_time_values_from_xen(cpu);
+		per_cpu(accounted_system_time, cpu) =
 		per_cpu(processed_system_time, cpu) =
 			per_cpu(shadow_time, 0).system_timestamp;
 		init_missing_ticks_accounting(cpu);
@@ -725,6 +771,7 @@ void __init time_init(void)
 
 	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
 	per_cpu(processed_system_time, 0) = processed_system_time;
+	per_cpu(accounted_system_time, 0) = processed_system_time;
 	init_missing_ticks_accounting(0);
 
 	clocksource_register(&clocksource_xen);
@@ -735,6 +782,9 @@ void __init time_init(void)
 
 	/* Cannot request_irq() until kmem is initialised. */
 	late_time_init = setup_cpu0_timer_irq;
+
+	if (!(duty_limit + 2))
+		duty_limit = __fls(nr_cpu_ids);
 }
 
 /* Convert jiffies to system time. */
@@ -773,6 +823,7 @@ static void stop_hz_timer(void)
 	struct vcpu_set_singleshot_timer singleshot;
 	unsigned int cpu = smp_processor_id();
 	unsigned long j;
+	u64 local;
 	int rc;
 
 	cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -798,6 +849,11 @@ static void stop_hz_timer(void)
 	singleshot.timeout_abs_ns = jiffies_to_st(j);
 	if (!singleshot.timeout_abs_ns)
 		return;
+	local = per_cpu(processed_system_time, cpu);
+	if ((s64)(singleshot.timeout_abs_ns - local) <= NS_PER_TICK) {
+		cpumask_clear_cpu(cpu, nohz_cpu_mask);
+		singleshot.timeout_abs_ns = local + NS_PER_TICK;
+	}
 	singleshot.timeout_abs_ns += NS_PER_TICK / 2;
 	singleshot.flags = 0;
 	rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
@@ -862,6 +918,7 @@ int __cpuinit local_setup_timer(unsigned
 	do {
 		seq = read_seqbegin(&xtime_lock);
 		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
+		per_cpu(accounted_system_time, cpu) =
 		per_cpu(processed_system_time, cpu) =
 			per_cpu(shadow_time, 0).system_timestamp;
 		init_missing_ticks_accounting(cpu);