File 20122-credit-accounting.patch of Package xen

# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1251210997 -3600
# Node ID 8faef78ea759d9bea56bea4da9c2ec8d40869bb3
# Parent  9cab2b654cb4a27b77dc2ce53ab1d76514a1e9f5
Accurate accounting for credit scheduler

Rather than debit a full 10ms of credit on a scheduler tick
(probabilistic), debit credits accurately based on time stamps.

The main problem this is meant to address is an attack on the
scheduler that allows a rogue guest to avoid ever being debited
credits.  The basic idea is that the rogue process checks time (using
rdtsc) periodically, and yields after 9.5ms.  Using this technique, a
guest can "steal" 95% of the cpu.  This is particularly an issue in
cloud environments.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>

# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1254471027 -3600
# Node ID 6f63970032a3fe5fda585beba929d2d7a5e14861
# Parent  fd3d5d66c446b3216e5e6180efd1f01db9a1b47c
credit scheduler: fix credits overflow

In changing credits-per-tick from 100 to 1000000, a possible overflow
was introduced in the accounting algorithm, when credit totals (which
can be in the millions) gets multiplied by a weight (typically 256):
the result can easily overflow a signed 32-bit variable.

Fix this by reverting to 100 credits per tick, and maintain long-term
fairness/correctness by tracking at the nanosecond level exactly how
much execution time has been accounted to each VCPU. We do this by
rounding the execution time so far to the nearest number of credits,
and then remembering the VCPU's 'partial credit balance'.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -42,11 +42,11 @@
 #define CSCHED_MSECS_PER_TICK       10
 #define CSCHED_MSECS_PER_TSLICE     \
     (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
-#define CSCHED_CREDITS_PER_TICK     100
+#define CSCHED_CREDITS_PER_MSEC     10
 #define CSCHED_CREDITS_PER_TSLICE   \
-    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
+    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TSLICE)
 #define CSCHED_CREDITS_PER_ACCT     \
-    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
+    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_ACCT)
 
 
 /*
@@ -200,6 +200,7 @@ struct csched_vcpu {
     struct csched_dom *sdom;
     struct vcpu *vcpu;
     atomic_t credit;
+    s_time_t start_time;   /* When we were scheduled (used for credit) */
     uint16_t flags;
     int16_t pri;
 #ifdef CSCHED_STATS
@@ -288,6 +289,22 @@ __runq_remove(struct csched_vcpu *svc)
     list_del_init(&svc->runq_elem);
 }
 
+static void burn_credits(struct csched_vcpu *svc, s_time_t now)
+{
+    s_time_t delta;
+    unsigned int credits;
+
+    /* Assert svc is current */
+    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+
+    if ( (delta = now - svc->start_time) <= 0 )
+        return;
+
+    credits = (delta*CSCHED_CREDITS_PER_MSEC + MILLISECS(1)/2) / MILLISECS(1);
+    atomic_sub(credits, &svc->credit);
+    svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
+}
+
 static inline void
 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
 {
@@ -346,6 +363,7 @@ csched_pcpu_init(int cpu)
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
         return -1;
+    memset(spc, 0, sizeof(*spc));
 
     spin_lock_irqsave(&csched_priv.lock, flags);
 
@@ -562,7 +580,8 @@ csched_vcpu_acct(unsigned int cpu)
     /*
      * Update credits
      */
-    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
+    if ( !is_idle_vcpu(svc->vcpu) )
+        burn_credits(svc, NOW());
 
     /*
      * Put this VCPU and domain back on the active list if it was
@@ -598,6 +617,7 @@ csched_vcpu_init(struct vcpu *vc)
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
         return -1;
+    memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
@@ -763,6 +783,7 @@ csched_dom_init(struct domain *dom)
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
         return -ENOMEM;
+    memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
     INIT_LIST_HEAD(&sdom->active_vcpu);
@@ -1190,6 +1211,13 @@ csched_schedule(s_time_t now)
     CSCHED_STAT_CRANK(schedule);
     CSCHED_VCPU_CHECK(current);
 
+    /* Update credits */
+    if ( !is_idle_vcpu(scurr->vcpu) )
+    {
+        burn_credits(scurr, now);
+        scurr->start_time -= now;
+    }
+
     /*
      * Select next runnable local VCPU (ie top of local runq)
      */
@@ -1227,6 +1255,9 @@ csched_schedule(s_time_t now)
         cpu_clear(cpu, csched_priv.idlers);
     }
 
+    if ( !is_idle_vcpu(snext->vcpu) )
+        snext->start_time += now;
+
     /*
      * Return task to run next...
      */
@@ -1320,7 +1351,7 @@ csched_dump(void)
            "\trunq_sort          = %u\n"
            "\tdefault-weight     = %d\n"
            "\tmsecs per tick     = %dms\n"
-           "\tcredits per tick   = %d\n"
+           "\tcredits per msec   = %d\n"
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
@@ -1332,7 +1363,7 @@ csched_dump(void)
            csched_priv.runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
-           CSCHED_CREDITS_PER_TICK,
+           CSCHED_CREDITS_PER_MSEC,
            CSCHED_TICKS_PER_TSLICE,
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
openSUSE Build Service is sponsored by