[PATCH] [perf] sched spr-replay load record/playback. - linaro-dev

31 May 2012

This is an attempt to create a portable scheduler load recorder
and playback. While there exists a perf recorder and playback
functionality in mainline perf it is not very useful in trying to
debug the scheduler.
There are a number of problems that this patch solves.
1. The format used by default is the binary perf trace format which is
both non-portable between arches and non human editable. The format
spr-replay uses is text based and easily understandable, and completely
portable between arches & kernel version. It can been used to collect
load data from an android ICS setup and then have the load analyzed
in a normal linux x86 host.
2. For accurate load capture, it is important that the implementation
details of the original machine that the load was captured to be
removed as much as possible. For example if you capture a trace
of two cpu hogs in a single core machine, the amount of time the
processes were blocked waiting for the cpu should be removed, and
when the load replayed on a dual core machine both tasks should be
run in parallel. There is a switch to preserve the timings if you
desire to compare.
3. When capturing a load you get to see various possibly spurious
wakeups and interruptions by kernel tasks. It is important to be
possible to select only the tasks that are important, and then
have all the other tasks effects removed.
4. This patch allows capture of forking activity which is important
and affects the scheduler operation quite a lot.
Example use:
Record load to a perf.data file (on target)
$ perf record \
-a -R -f -m 8192 -c 1 \
-e sched:sched_switch \
-e sched:sched_process_exit \
-e sched:sched_process_fork \
-e sched:sched_wakeup \
-e sched:sched_migrate_task
List the tasks in the perf.data capture file (on target)
$ perf sched spr-replay -l -n
ksoftirqd/0/3] S R:7839561
[events/0/6] S R:16961982
[sync_supers/12] S R:31803
[bdi-default/13] S R:48753
....
Process the perf.data capture file to generate an spr load program (on target)
$ perf sched spr-replay -g -n >capture.spr
Execute the recorded load from the perf.data file (on target)
$ perf sched spr-replay [-d] # -d debugging mode
Copy the capture.spr file and execute it (on host)
$ perf sched spr-replay -f capture.spr
Process the perf.data capture file to generate an spr load program for
of a single (or more) tasks (on target).
$ perf sched spr-replay -g -n -s <task-pid|task-name|~regexp> >capture.spr
For example capturing cyclictest (cyclictest -t1 -p 80 -n -i 100000 -l 1000)
Generates a capture.spr program like this:
[cyclictest/4646]
burn 11703
sleep 99927688
burn 30594
sleep 99943711
burn 55277
sleep 99744841
burn 76059
sleep 99782827
burn 93262
sleep 99868136
burn 112340
sleep 99825780
burn 132841
sleep 99799446
burn 150585
sleep 99850653
burn 163754
sleep 80514018
burn 215810
exit 0
end
Process the perf.data capture file to generate an spr load program which
excludes load from a couple of kernel tasks:
$ perf sched spr-replay -g -n -x '~flush-*' -x '~events/*' \
-x '~ksoftirqd/*' -x swapper >capture.spr
Signed-off-by: Pantelis Antoniou panto@antoniou-consulting.com
---
 tools/perf/builtin-sched.c | 2352 +++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 2332 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b125e07..3b23062 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -22,6 +22,13 @@
 #include <semaphore.h>
 #include <pthread.h>
 #include <math.h>
+#include <regex.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
static const char		*input_name;
@@ -43,6 +50,9 @@ static u64			sleep_measurement_overhead;
static unsigned long		nr_tasks;
+#define mb()			asm volatile ("" : : : "memory")
+static u64			bogoloops = 0;
+
 struct sched_atom;
struct task_desc {
@@ -61,6 +71,11 @@ struct task_desc {
    sem_t			work_done_sem;
u64			cpu_usage;
+
+	u64			task_state;
+
+	unsigned int		exited : 1;		/* exited */
+	unsigned int		selected : 1;		/* part of playback */
 };
enum sched_event_type {
@@ -68,6 +83,9 @@ enum sched_event_type {
    SCHED_EVENT_SLEEP,
    SCHED_EVENT_WAKEUP,
    SCHED_EVENT_MIGRATION,
+	SCHED_EVENT_EXIT,
+	SCHED_EVENT_FORK_PARENT,
+	SCHED_EVENT_FORK_CHILD,
 };
struct sched_atom {
@@ -78,11 +96,39 @@ struct sched_atom {
    unsigned long		nr;
    sem_t			*wait_sem;
    struct task_desc	*wakee;
+	struct sched_atom	*wakee_event;
+	u64			task_state;
+	int			pid;
+	struct task_desc	*parent;
+	struct task_desc	*child;
+
+	/* extra */
+	unsigned int		exited;	/* event was generated after task has called exit() */
+	char 			*msg;
+
+	unsigned int		waker_count;
+	struct task_desc	**wakers;
+	struct sched_atom	**waker_events;
+	unsigned long		wake_id;
 };
+static const char *sched_atom_str(const struct sched_atom *atom);
+
 static struct task_desc		*pid_to_task[MAX_PID];
+static u64			replay_start_time;
+static u64			replay_end_time;
 static struct task_desc		**tasks;
+static unsigned long		next_wake_id;
+static bool			preserve_time = false;
+static bool			dry_run = false;
+static bool			generate = false;
+static int 			debug = 0;
+static bool			spr_list = false;
+static bool			spr_select_list = false;
+static const char		*spr_filename = NULL;
+static bool			bogoburn = false;
+static bool			spr_replay = false;
static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
 static u64			start_time;
@@ -92,6 +138,9 @@ static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned long		nr_run_events;
 static unsigned long		nr_sleep_events;
 static unsigned long		nr_wakeup_events;
+static unsigned long		nr_exit_events;
+static unsigned long		nr_fork_parent_events;
+static unsigned long		nr_fork_child_events;
static unsigned long		nr_sleep_corrections;
 static unsigned long		nr_run_events_optimized;
@@ -218,6 +267,120 @@ static void calibrate_sleep_measurement_overhead(void)
    printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
 }
+static u64 bogoloops_measure_single(u64 loops)
+{
+	u64 cnt;
+	u64 ns1, ns2;
+
+	/* loop to make sure you start close to the monotonic clock */
+	ns2 = get_nsecs();
+	while ((ns1 = get_nsecs()) - ns2 == 0)
+		;
+	cnt = loops;
+	while (cnt-- > 0)
+		mb();
+	ns2 = get_nsecs();
+	return ns2 - ns1;
+}
+
+static u64 bogoloops_measure_stable(u64 loops)
+{
+	int i;
+	unsigned int j, k;
+	u64 delta;
+	u64 delta_samples[16];		/* 16 is an adequate number */
+	u64 estimate, maxdiff, new_estimate;
+	int imin, imax;	/* indices of min & max */
+
+	i = 0;
+	do {
+		imin = 0;
+		imax = 0;
+		estimate = 0;
+		for (j = 0; j < ARRAY_SIZE(delta_samples); j++) {
+			/* measure a single time */
+			delta = bogoloops_measure_single(loops);
+			delta_samples[j] = delta;
+			if (delta < delta_samples[imin])
+				imin = j;
+			if (delta > delta_samples[imax])
+				imax = j;
+			estimate += delta;
+		}
+		estimate /= ARRAY_SIZE(delta_samples);
+
+		maxdiff = estimate >> 3;	/* maxdiff is 12.5% of estimate */
+
+		/* only keep the ones that are with the proper range */
+		k = 0;
+		new_estimate = 0;
+		for (j = 0; j < ARRAY_SIZE(delta_samples); j++) {
+			if (delta_samples[j] > estimate)
+				delta = delta_samples[j] - estimate;
+			else
+				delta = estimate - delta_samples[j];
+
+			/* too bad, continue */
+			if (delta > maxdiff)
+				continue;
+			k++;
+			new_estimate += delta_samples[j];
+		}
+
+		/* we need at least half the samples to be right */
+		if (k >= ARRAY_SIZE(delta_samples) / 2)
+			return new_estimate / k;
+
+	} while (i++ < 8);	/* try for 8 times */
+
+	/* we have failed to find a stable measurement */
+	/* (this is normal for low loops values) */
+	return 0;
+}
+
+static void calculate_bogoloops_value(void)
+{
+	u64 loops, last_loops, new_loops;
+	u64 sample_period;
+	u64 ns1, ns2, delta, delta_new, delta_diff;
+
+	/* 1 ms */
+	sample_period = 1000000;
+
+	ns1 = get_nsecs();
+	loops = 0;
+	while (((ns2 = get_nsecs()) - ns1) < sample_period)
+		loops++;
+
+	/* it is guaranteed that the loops value would be lower */
+	/* we grow until we get right after the sample period */
+	/* then we interpolate for the period */
+	do {
+		last_loops = loops;
+		delta = bogoloops_measure_stable(loops);
+		loops *= 2;
+	} while (delta < sample_period);
+
+	new_loops = ((last_loops * sample_period) + (delta / 2)) / delta;
+	delta_new = bogoloops_measure_stable(new_loops);
+
+	if (delta_new > sample_period)
+		delta_diff = delta_new - sample_period;
+	else
+		delta_diff = sample_period - delta_new;
+
+	bogoloops = new_loops;
+}
+
+static void bogoburn_nsecs(u64 nsecs)
+{
+	u64 cnt;
+
+	cnt = (bogoloops * nsecs) / 1000000LLU;
+	while (cnt-- > 0)
+		mb();
+}
+
 static struct sched_atom *
 get_new_event(struct task_desc *task, u64 timestamp)
 {
@@ -228,6 +391,8 @@ get_new_event(struct task_desc *task, u64 timestamp)
    event->timestamp = timestamp;
    event->nr = idx;
+	event->exited = task->exited;
+
    task->nr_events++;
    size = sizeof(struct sched_atom *) * task->nr_events;
    task->atoms = realloc(task->atoms, size);
@@ -249,7 +414,8 @@ static struct sched_atom *last_event(struct task_desc *task)
 static void
 add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
 {
-	struct sched_atom *event, *curr_event = last_event(task);
+	struct sched_atom *curr_event = last_event(task);
+	struct sched_atom *event;
/*
     * optimize an existing RUN event by merging this one
@@ -270,6 +436,18 @@ add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
 }
static void
+add_sched_event_exit(struct task_desc *task, u64 timestamp)
+{
+	struct sched_atom *event;
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_EXIT;
+
+	nr_exit_events++;
+}
+
+static int
 add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
    	       struct task_desc *wakee)
 {
@@ -282,11 +460,26 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
    wakee_event = last_event(wakee);
    if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
    	targetless_wakeups++;
-		return;
+		return 1;	/* targetless wakeup */
    }
+	event->wakee_event = wakee_event;
+
+	/* add to the waker array */
+	wakee_event->waker_count++;
+	wakee_event->wakers = realloc(wakee_event->wakers,
+			wakee_event->waker_count * sizeof(*wakee_event->wakers));
+	BUG_ON(wakee_event->wakers == NULL);
+	wakee_event->wakers[wakee_event->waker_count-1] = task;
+
+	/* add to the waker event array */
+	wakee_event->waker_events = realloc(wakee_event->waker_events,
+			wakee_event->waker_count * sizeof(*wakee_event->waker_events));
+	BUG_ON(wakee_event->waker_events == NULL);
+	wakee_event->waker_events[wakee_event->waker_count-1] = event;
+
    if (wakee_event->wait_sem) {
    	multitarget_wakeups++;
-		return;
+		return 2;	/* multi target wakup */
    }
wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
@@ -295,19 +488,51 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
    event->wait_sem = wakee_event->wait_sem;
nr_wakeup_events++;
+
+	return 0;		/* normal wakeup with both waker & wakee */
 }
static void
 add_sched_event_sleep(struct task_desc *task, u64 timestamp,
-		      u64 task_state __used)
+		      u64 task_state)
 {
    struct sched_atom *event = get_new_event(task, timestamp);
event->type = SCHED_EVENT_SLEEP;
+	event->task_state = task_state;
+
+	event->wake_id = ++next_wake_id;
+	BUG_ON(next_wake_id == 0);	/* do not allow overflow */
nr_sleep_events++;
 }
+static void
+add_sched_event_fork_parent(struct task_desc *task, u64 timestamp, int child_pid)
+{
+	struct sched_atom *event;
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_FORK_PARENT;
+	event->pid = child_pid;
+
+	nr_fork_parent_events++;
+}
+
+static void
+add_sched_event_fork_child(struct task_desc *task, u64 timestamp, int parent_pid)
+{
+	struct sched_atom *event;
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_FORK_CHILD;
+	event->pid = parent_pid;
+
+	nr_fork_child_events++;
+}
+
 static struct task_desc *register_pid(unsigned long pid, const char *comm)
 {
    struct task_desc *task;
@@ -316,18 +541,29 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
task = pid_to_task[pid];
-	if (task)
+	if (task) {
+		/* update task name if it was unknown before */
+		if (comm != NULL && strcmp(task->comm, "<unknown>") == 0) {
+			if (verbose)
+				printf("task #%lu with pid %ld updates name from '%s' to '%s'\n",
+						task->nr, task->pid, task->comm, comm);
+			strcpy(task->comm, comm);
+		}
    	return task;
+	}
task = zalloc(sizeof(*task));
    task->pid = pid;
    task->nr = nr_tasks;
-	strcpy(task->comm, comm);
+	strcpy(task->comm, comm ? comm : "<unknown>");
+
    /*
     * every task starts in sleeping state - this gets ignored
     * if there's no wakeup pointing to this sleep state:
+	 * NOTE: spr-replay get's awfully messy with this; removed.
     */
-	add_sched_event_sleep(task, 0, 0);
+	if (!spr_replay)
+		add_sched_event_sleep(task, 0, 0);
pid_to_task[pid] = task;
    nr_tasks++;
@@ -336,7 +572,7 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
    tasks[task->nr] = task;
if (verbose)
-		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, task->comm);
return task;
 }
@@ -390,6 +626,11 @@ process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
    		break;
    	case SCHED_EVENT_MIGRATION:
    		break;
+			/* unused */
+		case SCHED_EVENT_EXIT:
+		case SCHED_EVENT_FORK_PARENT:
+		case SCHED_EVENT_FORK_CHILD:
+			break;
    	default:
    		BUG_ON(1);
    }
@@ -709,6 +950,21 @@ struct trace_fork_event {
    u32 child_pid;
 };
+struct trace_exit_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	u32 pid, ppid;
+	u32 tid, ptid;
+	u64 time;
+};
+
+
 struct trace_migrate_task_event {
    u32 size;
@@ -726,6 +982,9 @@ struct trace_migrate_task_event {
 };
struct trace_sched_handler {
+	int (*comm_event)(struct machine *machine,
+				u32 tid, const char *comm);
+
    void (*switch_event)(struct trace_switch_event *,
    		     struct machine *,
    		     struct event_format *,
@@ -759,18 +1018,80 @@ struct trace_sched_handler {
    		   int cpu,
    		   u64 timestamp,
    		   struct thread *thread);
+
+	void (*exit_event)(struct trace_exit_event *,
+			   struct machine *machine,
+			   struct event_format *,
+			   int cpu,
+			   u64 timestamp,
+			   struct thread *thread);
 };
+ __used static void dump_event(struct event_format *event)
+{
+	printf("dump of event=%p\n", event);
+	printf("  %-20s %s\n", "name", event->name);
+	printf("  %-20s %d\n", "id", event->id);
+	printf("  %-20s %d (%s|%s|%s|%s|%s|%s)\n", "flags", event->flags,
+		(event->flags & EVENT_FL_ISFTRACE) ? "ISFTRACE" : "",
+		(event->flags & EVENT_FL_ISPRINT) ? "ISPRINT" : "",
+		(event->flags & EVENT_FL_ISBPRINT) ? "ISBPRINT" : "",
+		(event->flags & EVENT_FL_ISFUNCENT) ? "ISFUNCENT" : "",
+		(event->flags & EVENT_FL_ISFUNCRET) ? "ISFUNCRET" : "",
+		(event->flags & EVENT_FL_FAILED) ? "FAILED" : "");
+	printf("  %-20s %s\n", "system", event->system);
+}
+
+__used static void dump_perf_sample(struct perf_sample *sample)
+{
+	printf("dump of perf_sample=%p\n", sample);
+	printf("  %-20s %" PRIu64 "\n", "ip", sample->ip);
+	printf("  %-20s %" PRIu32 "\n", "pid", sample->pid);
+	printf("  %-20s %" PRIu32 "\n", "tid", sample->tid);
+	printf("  %-20s %" PRIu64 "\n", "time", sample->time);
+	printf("  %-20s %" PRIu64 "\n", "addr", sample->addr);
+	printf("  %-20s %" PRIu64 "\n", "id", sample->id);
+	printf("  %-20s %" PRIu64 "\n", "stream_id", sample->stream_id);
+	printf("  %-20s %" PRIu64 "\n", "period", sample->period);
+	printf("  %-20s %" PRIu32 "\n", "cpu", sample->cpu);
+	printf("  %-20s %" PRIu32 "\n", "raw_size", sample->raw_size);
+	printf("  %-20s %p\n", "raw_data", sample->raw_data);
+}
+
+static u64 cpu_last_switched[MAX_CPUS];
+struct task_desc *cpu_last_task[MAX_CPUS];
+
+static int
+replay_comm_event(struct machine *machine __used, u32 tid, const char *comm)
+{
+	struct task_desc *task;
+
+	if (verbose) {
+		printf("replay_comm_event tid=%u comm=%s\n",
+				tid, comm);
+	}
+
+	task = register_pid(tid, comm);
+	if (task == NULL) {
+		dump_printf("problem processing replay_comm_event, skipping event.\n");
+		return -1;
+	}
+	return 0;
+}
static void
 replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
    	    struct machine *machine __used,
    	    struct event_format *event,
    	    int cpu __used,
-		    u64 timestamp __used,
+		    u64 timestamp,
    	    struct thread *thread __used)
 {
    struct task_desc *waker, *wakee;
+	struct sched_atom *waker_event, *wakee_event;
+	char *s, *e;
+	char buf[BUFSIZ];
+	int ret;
if (verbose) {
    	printf("sched_wakeup event %p\n", event);
@@ -781,13 +1102,47 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
    		wakeup_event->pid);
    }
-	waker = register_pid(wakeup_event->common_pid, "<unknown>");
+	waker = register_pid(wakeup_event->common_pid, NULL);
    wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
-	add_sched_event_wakeup(waker, timestamp, wakee);
-}
+	ret = add_sched_event_wakeup(waker, timestamp, wakee);
-static u64 cpu_last_switched[MAX_CPUS];
+	waker_event = last_event(waker);
+	BUG_ON(waker_event == NULL);
+
+	wakee_event = last_event(wakee);
+
+	if (debug > 2) {
+		s = buf;
+		e = &buf[ARRAY_SIZE(buf)];
+		e[-1] = '\0';
+
+		snprintf(s, e - s, "wakee %d preempt=%u prio=%" PRIu32 " succ=%" PRIu32 "",
+				wakeup_event->pid,
+				(unsigned int)wakeup_event->common_preempt_count,
+				wakeup_event->prio, wakeup_event->cpu);
+		e[-1] = '\0';
+		s += strlen(s);
+
+		if (ret == 1) {
+			snprintf(s, e - s, " (targetless )");
+			e[-1] = '\0';
+			s += strlen(s);
+		} else if (ret == 2) {
+			snprintf(s, e - s, " (multitarget)");
+			e[-1] = '\0';
+			s += strlen(s);
+		}
+
+		if (wakee_event != NULL) {
+			snprintf(s, e - s, " (%s)", sched_atom_str(wakee_event));
+			e[-1] = '\0';
+			s += strlen(s);
+		}
+
+		waker_event->msg = strdup(buf);
+	}
+}
static void
 replay_switch_event(struct trace_switch_event *switch_event,
@@ -797,9 +1152,11 @@ replay_switch_event(struct trace_switch_event *switch_event,
    	    u64 timestamp,
    	    struct thread *thread __used)
 {
-	struct task_desc *prev, __used *next;
+	struct task_desc *prev, *next;
+	struct sched_atom *prev_atom;
    u64 timestamp0;
    s64 delta;
+	char buf[BUFSIZ];
if (verbose)
    	printf("sched_switch event %p\n", event);
@@ -827,9 +1184,20 @@ replay_switch_event(struct trace_switch_event *switch_event,
    next = register_pid(switch_event->next_pid, switch_event->next_comm);
cpu_last_switched[cpu] = timestamp;
+	cpu_last_task[cpu] = next;
add_sched_event_run(prev, timestamp, delta);
-	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
+	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);	/* was adding +delta */
+	prev->task_state = switch_event->prev_state;
+	next->task_state = 0;	/* task running */
+
+	prev_atom = last_event(prev);
+	BUG_ON(prev_atom == NULL);
+
+	if (debug > 2) {
+		snprintf(buf, sizeof(buf), "switch to %d", switch_event->next_pid);
+		prev_atom->msg = strdup(buf);
+	}
 }
@@ -837,22 +1205,66 @@ static void
 replay_fork_event(struct trace_fork_event *fork_event,
    	  struct event_format *event,
    	  int cpu __used,
-		  u64 timestamp __used,
+		  u64 timestamp,
    	  struct thread *thread __used)
 {
+	struct task_desc *parent, *child;
+	struct sched_atom *atom;
+	char buf[BUFSIZ];
+
    if (verbose) {
    	printf("sched_fork event %p\n", event);
    	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
    	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
    }
-	register_pid(fork_event->parent_pid, fork_event->parent_comm);
-	register_pid(fork_event->child_pid, fork_event->child_comm);
+
+	parent = register_pid(fork_event->parent_pid, fork_event->parent_comm);
+	child = register_pid(fork_event->child_pid, fork_event->child_comm);
+
+	add_sched_event_fork_parent(parent, timestamp, fork_event->child_pid);
+
+	atom = last_event(parent);
+	BUG_ON(atom == NULL);
+	atom->child = child;
+
+	if (debug > 2) {
+		snprintf(buf, sizeof(buf), "child %d", fork_event->child_pid);
+		atom->msg = strdup(buf);
+	}
+
+	add_sched_event_fork_child(child, timestamp, fork_event->parent_pid);
+
+	atom = last_event(child);
+	BUG_ON(atom == NULL);
+	atom->parent = parent;
+
+	if (debug > 2) {
+		snprintf(buf, sizeof(buf), "parent %d", fork_event->parent_pid);
+		atom->msg = strdup(buf);
+	}
+}
+
+static void
+replay_exit_event(struct trace_exit_event *exit_event __used,
+			struct machine *machine __used,
+			struct event_format *event __used,
+			int cpu __used,
+			u64 timestamp,
+			struct thread *thread __used)
+{
+	struct task_desc *task;
+
+	task = register_pid(exit_event->pid, NULL);	/* should never pick up unkown */
+	add_sched_event_exit(task, timestamp);
+	task->exited = 1;	/* mark that this task has exited */
 }
static struct trace_sched_handler replay_ops  = {
+	.comm_event		= replay_comm_event,
    .wakeup_event		= replay_wakeup_event,
    .switch_event		= replay_switch_event,
    .fork_event		= replay_fork_event,
+	.exit_event		= replay_exit_event,
 };
struct sort_dimension {
@@ -1559,8 +1971,23 @@ process_sched_exit_event(struct perf_tool *tool __used,
    		 struct machine *machine __used,
    		 struct thread *thread __used)
 {
+	void *data = sample->raw_data;
+	struct trace_exit_event exit_event;
+
    if (verbose)
    	printf("sched_exit event %p\n", event);
+
+	FILL_COMMON_FIELDS(exit_event, event, data);
+
+	FILL_FIELD(exit_event, pid, event, data);
+	FILL_FIELD(exit_event, ppid, event, data);
+	FILL_FIELD(exit_event, tid, event, data);
+	FILL_FIELD(exit_event, ptid, event, data);
+	FILL_FIELD(exit_event, time, event, data);
+
+	if (trace_handler->exit_event)
+		trace_handler->exit_event(&exit_event, machine, event,
+					  sample->cpu, sample->time, thread);
 }
static void
@@ -1620,11 +2047,26 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
    return 0;
 }
+static int perf_sched__process_comm(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct machine *machine)
+{
+	if (verbose)
+		printf("\t*** %s:%d\n", event->comm.comm, event->comm.tid);
+
+	if (trace_handler->comm_event)
+		trace_handler->comm_event(machine, event->comm.tid, event->comm.comm);
+
+	return perf_event__process_comm(tool, event, sample, machine);
+}
+
 static struct perf_tool perf_sched = {
    .sample			= perf_sched__process_tracepoint_sample,
-	.comm			= perf_event__process_comm,
+	.comm			= perf_sched__process_comm,
    .lost			= perf_event__process_lost,
    .fork			= perf_event__process_task,
+	.exit			= perf_event__process_task,
    .ordered_samples	= true,
 };
@@ -1780,7 +2222,7 @@ static void __cmd_replay(void)
static const char * const sched_usage[] = {
-	"perf sched [<options>] {record|latency|map|replay|script}",
+	"perf sched [<options>] {record|latency|map|replay|script|spr-replay}",
    NULL
 };
@@ -1883,6 +2325,1868 @@ static int __cmd_record(int argc, const char **argv)
    return cmd_record(i, rec_argv, NULL);
 }
+/**************************************************************************/
+
+enum task_action_type {
+	TA_END,
+	TA_BURN,
+	TA_SLEEP,
+	TA_SPAWN,
+	TA_EXIT,
+	TA_CLONE_PARENT,
+	TA_CLONE_CHILD,
+	TA_WAIT_ID,
+	TA_SIGNAL_ID
+};
+
+struct task_action {
+	enum task_action_type action;
+	union {
+		struct { u64 nsecs; } burn;
+		struct { u64 nsecs; } sleep;
+		struct { u64 nsecs; } spawn;
+		struct { int ret; } exit;
+		struct { int child_pid;  } clone_parent;
+		struct { int parent_pid; } clone_child;
+		struct { unsigned long id; } wait_id;
+		struct { unsigned long id; } signal_id;
+	} u;
+};
+
+#define T(x)	((u64)((double)(x) * 1E9))
+
+struct task {
+	const char *name;
+	int pid;
+	const struct task_action *actions;
+};
+
+void *work_area;
+struct playback *playback;
+
+enum match_type {
+	MT_PID,
+	MT_NAME,
+	MT_REGEX,
+};
+
+struct match_list_entry {
+	struct list_head node;
+	enum match_type type;
+	int pid;
+	char *name;
+	regex_t regex;
+};
+
+LIST_HEAD(select_list);
+LIST_HEAD(exclude_list);
+
+static int match_task(struct list_head *lh, const char *name, int pid, int empty_list_val)
+{
+	int matched;
+	struct match_list_entry *mle;
+
+	if (list_empty(lh))
+		return empty_list_val;
+
+	list_for_each_entry(mle, lh, node) {
+		switch (mle->type) {
+		case MT_PID:
+			matched = mle->pid == pid;
+			break;
+		case MT_NAME:
+			matched = strcmp(mle->name, name) == 0;
+			break;
+		case MT_REGEX:
+			matched = regexec(&mle->regex, name, 0, NULL, 0) == 0;
+			break;
+		default:
+			matched = 0;
+			break;
+		}
+		if (matched)
+			return 1;
+	}
+	return 0;
+}
+
+struct task_run {
+	struct playback *playback;	/* point back to the playback */
+	const struct task *task;	/* task */
+	const struct task_action *current_action;
+	char name[BUFSIZ];		/* lots of space for a name */
+	int id;
+	pid_t real_pid;
+	pid_t parent_pid;		/* when created by CLONE_PARENT */
+	u64 start_time;		/* time when the current task starts */
+	u64 local_time;		/* local time (beginning from 0) */
+	int children_count;
+};
+
+struct playback_info {
+	int task_cnt;
+	int sem_cnt;
+	int task_off;
+	int sem_off;
+	int work_area_size;
+};
+
+struct playback {
+	struct playback_info info;
+
+	u64 origin_time;		/* time when the simulation starts */
+	int task_count;
+	struct task_run *task_runs;
+	unsigned long sem_count;
+	sem_t *semas;
+
+	unsigned int flags;		/* flags */
+#define PF_ABSOLUTE_TIME		1
+#define PF_RELATIVE_TIME		2
+#define PF_TIME_MASK			3
+
+	int debug_level;
+
+	int spawn_count;		/* left to spawn */
+	int exited_count;		/* left to exit */
+	int forked_count;		/* initially forked */
+};
+
+static inline int playback_absolute_time(const struct playback *p)
+{
+	return ((p->flags & PF_TIME_MASK) == PF_ABSOLUTE_TIME);
+}
+
+static inline int playback_relative_time(const struct playback *p)
+{
+	return ((p->flags & PF_TIME_MASK) == PF_RELATIVE_TIME);
+}
+
+static inline void playback_set_absolute_time(struct playback *p)
+{
+	p->flags = (p->flags & ~PF_TIME_MASK) | PF_ABSOLUTE_TIME;
+}
+
+static inline void playback_set_relative_time(struct playback *p)
+{
+	p->flags = (p->flags & ~PF_TIME_MASK) | PF_ABSOLUTE_TIME;
+}
+
+static inline void playback_set_debug_level(struct playback *p, int debug_level)
+{
+	p->debug_level = debug_level;
+}
+
+static const char *task_action_str(u64 local_ts, const struct task_action *ta);
+const struct task_action *task_action_current(struct task_run *tr);
+void task_action_advance(struct task_run *tr);
+
+void vtprintf(struct task_run *tr, const char *fmt, va_list ap);
+void tprintf(struct task_run *tr, const char *fmt, ...) __attribute__((__format__(__printf__,2,3)));
+
+void vtprintf(struct task_run *tr, const char *fmt, va_list ap)
+{
+	struct playback *p = tr->playback;
+	int n, size;
+	char *str = NULL;
+
+	if (p->debug_level <= 0)
+		return;
+
+	size = 256;	/* start with 80 chars (40 * 2) */
+	do {
+		size *= 2;
+		str = realloc(str, size);
+		BUG_ON(str == NULL);
+		/* Try to print in the allocated space. */
+		n = vsnprintf(str, size, fmt, ap);
+	} while (n <  0 || n >= size);
+
+	printf("#%2d [%16" PRIu64 " %16" PRIu64 "] %s", tr->id,
+			tr->local_time,
+			get_nsecs() - p->origin_time,
+			str);
+	free(str);
+}
+
+void tprintf(struct task_run *tr, const char *fmt, ...)
+{
+	struct playback *p;
+	va_list ap;
+
+	p = tr->playback;
+
+	if (p->debug_level <= 0)
+		return;
+
+	va_start(ap, fmt);
+	vtprintf(tr, fmt, ap);
+	va_end(ap);
+}
+
+
+const struct task_action *task_action_current(struct task_run *tr)
+{
+	if (tr->current_action->action == TA_END)
+		return NULL;
+	return tr->current_action;
+}
+
+void task_action_advance(struct task_run *tr)
+{
+	if (tr->current_action->action != TA_END)
+		tr->current_action++;
+}
+
+struct task_action_desc {
+	unsigned int id;
+	const char *name;
+	void (*execute)(struct task_run *tr, const struct task_action *ta);
+};
+
+static void execute_task(struct task_run *tr) __attribute__((__noreturn__));
+
+/* END is not processed */
+
+static void execute_spawn(struct task_run *tr __used, const struct task_action *ta __used)
+{
+	/* we do nothing */
+}
+
+static void execute_burn(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	u64 t1, dt, target;
+
+	t1 = get_nsecs();
+	dt = ta->u.burn.nsecs;
+
+	if (playback_absolute_time(p))
+		target = p->origin_time + tr->local_time + dt;
+	else
+		target = t1 + dt;
+
+	if (target <= t1) {
+		tprintf(tr, "%s %" PRIu64 " - SKIPPED; slip of %" PRIu64 "ns\n",
+				"BURN", ta->u.burn.nsecs, t1 - target);
+		return;
+	}
+
+	tprintf(tr, "%s %" PRIu64 ":%" PRIi64 "\n", "BURN",
+			ta->u.burn.nsecs, (int64_t)(target - t1));
+
+	if (!bogoburn)
+		burn_nsecs(target - t1);
+	else
+		bogoburn_nsecs(target - t1);
+
+	tr->local_time += dt;
+}
+
+static void execute_sleep(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	u64 t1, dt, target;
+
+	t1 = get_nsecs();
+	dt = ta->u.sleep.nsecs;
+
+	if (playback_absolute_time(p))
+		target = p->origin_time + tr->local_time + dt;
+	else
+		target = t1 + dt;
+
+	if (target <= t1) {
+		tprintf(tr, "%s %" PRIu64 " - SKIPPED; slip of %" PRIu64 "ns\n",
+				"SLEEP", ta->u.sleep.nsecs, t1 - target);
+		return;
+	}
+
+	tprintf(tr, "%s %" PRIu64 ":%" PRIi64 "\n", "SLEEP",
+			ta->u.burn.nsecs, (int64_t)(target - t1));
+
+	sleep_nsecs(target - t1);	/* bogomips */
+
+	tr->local_time += dt;
+}
+
+static void execute_exit(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	struct task_run *tr1;
+	pid_t real_pid;
+	int i, status;
+
+	tprintf(tr, "%s\n", "EXIT");
+
+	while (tr->children_count > 0) {
+		real_pid = wait(&status);
+		if (real_pid < 0) {
+			fprintf(stderr, "panto-perf: Wait failed: %s:%d\n", __func__, __LINE__);
+			exit(EXIT_FAILURE);
+		}
+		if (!WIFEXITED(status))
+			continue;
+
+		/* locate child */
+		for (i = 0, tr1 = p->task_runs; i < p->task_count; i++, tr1++)
+			if (real_pid == tr1->real_pid)
+				break;
+		assert(i < p->task_count);
+
+		tr->children_count--;
+	}
+	exit(ta->u.exit.ret);
+}
+
+static void execute_clone_parent(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	struct task_run *tr1;
+	int i, pid;
+	pid_t real_pid;
+
+	pid = ta->u.clone_parent.child_pid;
+
+	tprintf(tr, "%s %d\n", "CLONE_PARENT", ta->u.clone_parent.child_pid);
+
+	/* locate child */
+	for (i = 0, tr1 = p->task_runs; i < p->task_count; i++, tr1++)
+		if (pid == tr1->task->pid)
+			break;
+	assert(i < p->task_count);
+
+	tr1->parent_pid = getpid();
+
+	/* all clones are for now forks */
+	real_pid = fork();
+	assert(real_pid >= 0);
+
+	if (real_pid == 0) {	/* child */
+		tr1->real_pid = getpid();
+		tr1->start_time = tr->start_time + tr->local_time;
+		tr1->local_time = tr->local_time;
+		execute_task(tr1);
+	}
+	tr1->real_pid = real_pid;
+	tr->children_count++;
+}
+
+static void execute_clone_child(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+
+	(void)p;
+
+	/* nothing */
+	tprintf(tr, "%s %d\n", "CLONE_CHILD", ta->u.clone_child.parent_pid);
+}
+
+static void execute_wait_id(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	sem_t *sem_ptr;
+	u64 t1, dt;
+	int ret;
+
+	t1 = get_nsecs();
+
+	tprintf(tr, "%s %lu\n", "WAIT_ID", ta->u.wait_id.id);
+
+	/* wait for someone to wake us up */
+	assert(ta->u.wait_id.id < p->sem_count);
+	sem_ptr = p->semas + ta->u.wait_id.id;
+	do {
+		ret = sem_wait(sem_ptr);
+	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+	if (ret != 0) {
+		printf("sem_wait failed (%s wait_id=%lu) = %d errno=%d(%s)\n", tr->name,
+				ta->u.wait_id.id, ret, errno, strerror(errno));
+		assert(0);
+	}
+
+	dt = get_nsecs() - t1;
+
+	tr->local_time += dt;
+}
+
+static void execute_signal_id(struct task_run *tr, const struct task_action *ta)
+{
+	struct playback *p = tr->playback;
+	sem_t *sem_ptr;
+	int ret;
+
+	tprintf(tr, "%s %lu\n", "SIGNAL_ID", ta->u.signal_id.id);
+
+	/* signal wake up for the other thread */
+	assert(ta->u.signal_id.id < p->sem_count);
+	sem_ptr = p->semas + ta->u.signal_id.id;
+	do {
+		ret = sem_post(sem_ptr);
+	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+	if (ret == -1) {
+		printf("sem_post failed (%s signal_id=%lu) = %d errno=%d(%s)\n", tr->name,
+				ta->u.signal_id.id, ret, errno, strerror(errno));
+		assert(0);
+	}
+
+	/* local time does not advance */
+}
+
+static const struct task_action_desc action_table[] = {
+	[TA_END] = {
+		.id		= TA_END,
+		.name		= "END",
+	},
+	[TA_SPAWN] = {
+		.id		= TA_SPAWN,
+		.name		= "SPAWN",
+		.execute	= execute_spawn,
+	},
+	[TA_BURN] = {
+		.id		= TA_BURN,
+		.name		= "BURN",
+		.execute	= execute_burn,
+	},
+	[TA_SLEEP] = {
+		.id		= TA_SLEEP,
+		.name		= "SLEEP",
+		.execute	= execute_sleep,
+	},
+	[TA_EXIT] = {
+		.id		= TA_EXIT,
+		.name		= "EXIT",
+		.execute	= execute_exit,
+	},
+	[TA_CLONE_PARENT] = {
+		.id		= TA_CLONE_PARENT,
+		.name		= "CLONE_PARENT",
+		.execute	= execute_clone_parent,
+	},
+	[TA_CLONE_CHILD] = {
+		.id		= TA_CLONE_CHILD,
+		.name		= "CLONE_CHILD",
+		.execute	= execute_clone_child,
+	},
+	[TA_WAIT_ID] = {
+		.id		= TA_WAIT_ID,
+		.name		= "WAIT_ID",
+		.execute	= execute_wait_id,
+	},
+	[TA_SIGNAL_ID] = {
+		.id		= TA_SIGNAL_ID,
+		.name		= "SIGNAL_ID",
+		.execute	= execute_signal_id,
+	},
+};
+
+static void execute_task(struct task_run *tr)
+{
+	const struct task *t;
+	const struct task_action *ta;
+	struct playback *p;
+
+	t = tr->task;
+	assert(t != NULL);
+
+	p = tr->playback;
+	assert(p != NULL);
+
+	/* copy name from the task and inform */
+	snprintf(tr->name, sizeof(tr->name) - 1, ":%s-%d", t->name, t->pid);
+	tr->name[sizeof(tr->name) - 1] = '\0';
+	prctl(PR_SET_NAME, tr->name);
+
+	/* skip over spawn */
+	while ((ta = task_action_current(tr)) != NULL && ta->action == TA_SPAWN) {
+		tr->local_time += ta->u.spawn.nsecs;
+		task_action_advance(tr);
+	}
+
+	tprintf(tr, "start: name %s, pid %d\n", t->name, t->pid);
+
+	while ((ta = task_action_current(tr)) != NULL) {
+		/* verify size */
+		assert((unsigned int)ta->action < ARRAY_SIZE(action_table));
+		(*action_table[ta->action].execute)(tr, ta);
+
+		task_action_advance(tr);
+	}
+
+	exit(0);
+}
+
+static void dump_task(const struct task *t)
+{
+	const struct task_action *ta;
+	u64 ts;
+
+	printf("TASK: name %s, pid %d\n", t->name, t->pid);
+
+	ts = 0;
+	for (ta = t->actions; ; ta++) {
+
+		if (ta->action == TA_SPAWN)
+			ts = ta->u.spawn.nsecs;
+
+		printf("%s\n", task_action_str(ts, ta));
+		if (ta->action == TA_END)
+			break;
+
+		if (ta->action == TA_BURN)
+			ts += ta->u.burn.nsecs;
+		else if (ta->action == TA_SLEEP)
+			ts += ta->u.sleep.nsecs;
+	}
+}
+
+static void generate_task(const struct task *t)
+{
+	const struct task_action *ta;
+
+	printf("[%s/%d]\n", t->name, t->pid);
+
+	for (ta = t->actions; ; ta++) {
+
+		switch (ta->action) {
+			case TA_BURN:
+				printf("\tburn %" PRIu64 "\n", ta->u.burn.nsecs);
+				break;
+			case TA_SLEEP:
+				printf("\tsleep %" PRIu64 "\n", ta->u.sleep.nsecs);
+				break;
+			case TA_SPAWN:
+				printf("\tspawn %" PRIu64 "\n", ta->u.spawn.nsecs);
+				break;
+			case TA_CLONE_PARENT:
+				printf("\tclone-parent %d\n", ta->u.clone_parent.child_pid);
+				break;
+			case TA_CLONE_CHILD:
+				printf("\tclone-child %d\n", ta->u.clone_child.parent_pid);
+				break;
+			case TA_WAIT_ID:
+				printf("\twait-id %lu\n", ta->u.wait_id.id);
+				break;
+			case TA_SIGNAL_ID:
+				printf("\tsignal-id %lu\n", ta->u.signal_id.id);
+				break;
+			case TA_EXIT:
+				printf("\texit %d\n", ta->u.exit.ret);
+				break;
+			case TA_END:
+				printf("\tend\n");
+			default:
+				break;
+		}
+
+		if (ta->action == TA_END)
+			break;
+	}
+}
+
+static void fill_playback_info(struct playback_info *pi, const struct task * const * tasks_array)
+{
+	const struct task *t;
+	const struct task * const *tt;
+	const struct task_action *ta;
+	unsigned long id, maxid;
+	int pagesize;
+
+	/* get page size */
+	pagesize = getpagesize();
+
+	/* make sure it's a power of two */
+	assert((pagesize & (pagesize - 1)) == 0);
+
+	/* align work area to 16 bytes */
+	pi->work_area_size = ALIGN(sizeof(struct playback), 16);
+	pi->task_off = pi->work_area_size;
+
+	/* count number of tasks (and find maximum futex id) */
+	maxid = -1;
+	pi->task_cnt = 0;
+	tt = tasks_array;
+	while ((t = *tt++) != NULL) {
+		pi->task_cnt++;
+		for (ta = t->actions; ta != NULL && ta->action != TA_END; ta++) {
+			if (ta->action == TA_WAIT_ID)
+				id = ta->u.wait_id.id;
+			else if (ta->action == TA_SIGNAL_ID)
+				id = ta->u.signal_id.id;
+			else
+				continue;
+
+			if (id > maxid || maxid == (unsigned long)-1)
+				maxid = id;
+		}
+	}
+	pi->sem_cnt = maxid + 1;
+
+	/* add the size of the task structs */
+	pi->work_area_size += ALIGN(sizeof(struct task_run) * pi->task_cnt, 16);
+	pi->sem_off = pi->work_area_size;
+
+	/* add the size of the semas */
+	pi->work_area_size += ALIGN(sizeof(sem_t) * pi->sem_cnt, 16);
+
+	/* now align to the pagesize (we have verified that pagesize is a power of two) */
+	pi->work_area_size = ALIGN(pi->work_area_size, pagesize);
+
+	printf("#%d tasks, #%d sems, total work area size %d\n",
+			pi->task_cnt, pi->sem_cnt, pi->work_area_size);
+
+}
+
+static struct playback *playback_create(const struct task * const *tasks_array)
+{
+	struct playback_info pi;
+	const struct task * const *tt;
+	struct task_run *tr;
+	int i;
+	unsigned long j;
+	struct playback *p;
+
+	assert(tasks_array != NULL);
+
+	/* layout playback structure according to inputs */
+	fill_playback_info(&pi, tasks_array);
+
+	/* map the shared memory across all processes */
+	work_area = mmap(NULL, pi.work_area_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
+	assert(work_area != MAP_FAILED);
+
+	memset(work_area, 0, pi.work_area_size);
+
+	playback = work_area;
+
+	p = playback;
+	memcpy(&p->info, &pi, sizeof(pi));
+
+	p->origin_time = 0;
+	p->task_count = pi.task_cnt;
+	p->task_runs = work_area + pi.task_off;
+	p->sem_count = pi.sem_cnt;
+	p->semas = work_area + pi.sem_off;
+	for (j = 0; j < p->sem_count; j++)
+		sem_init(&p->semas[j], 1, 0);
+
+	p->spawn_count = 0;
+	p->exited_count = 0;
+	p->forked_count = 0;
+
+	for (i = 0, tr = p->task_runs, tt = tasks_array; i < p->task_count;
+			i++, tr++, tt++) {
+
+		tr->playback = p;
+		tr->task = *tt;
+		tr->current_action = tr->task->actions;
+		tr->id = i;
+		tr->real_pid = -1;
+		tr->start_time = 0;
+		tr->local_time = 0;
+	}
+
+	return p;
+}
+
+static void playback_destroy(struct playback *p)
+{
+	assert(p != NULL);
+
+	munmap(p, p->info.work_area_size);
+}
+
+static void playback_run(struct playback *p)
+{
+	struct task_run *tr, *tr1;
+	const struct task *t;
+	const struct task_action *ta;
+	int i, status;
+	pid_t real_pid;
+	uint64_t curr_nsecs;
+
+	p->origin_time = get_nsecs();
+
+	p->spawn_count = 0;
+	for (i = 0; i < p->task_count; i++) {
+		tr = &p->task_runs[i];
+		t = tr->task;
+		if (t->actions->action == TA_SPAWN) {
+			p->spawn_count++;
+			continue;
+		}
+
+		/* don't bother with children */
+		if (t->actions->action == TA_CLONE_CHILD)
+			continue;
+
+		real_pid = fork();
+		assert(real_pid >= 0);
+
+		if (real_pid == 0) {
+			/* fill up for the child */
+			tr->start_time = get_nsecs();
+			tr->real_pid = getpid();
+			execute_task(tr);
+		}
+		tr->real_pid = real_pid;
+
+		p->forked_count++;
+	}
+
+	curr_nsecs = 0;
+	while (p->spawn_count > 0) {
+		tr1 = NULL;
+		for (i = 0; i < p->task_count; i++) {
+			tr = &p->task_runs[i];
+			t = tr->task;
+			ta = t->actions;
+
+			if (tr->real_pid >= 0 || ta->action != TA_SPAWN)
+				continue;
+
+			if (tr1 == NULL || tr1->task->actions->u.spawn.nsecs > ta->u.spawn.nsecs)
+				tr1 = tr;
+		}
+		assert(tr1 != NULL);
+
+		tr = tr1;
+		ta = t->actions;
+		sleep_nsecs(ta->u.spawn.nsecs - curr_nsecs);
+
+		real_pid = fork();
+		assert(real_pid >= 0);
+
+		if (real_pid == 0) {
+			/* fill up for the child */
+			tr->start_time = get_nsecs();
+			tr->real_pid = getpid();
+			execute_task(tr);
+		}
+		tr->real_pid = real_pid;
+
+		curr_nsecs += (ta->u.spawn.nsecs - curr_nsecs);
+		p->spawn_count--;
+		p->forked_count++;
+	}
+
+	while (p->forked_count > 0) {
+		real_pid = wait(&status);
+		if (real_pid < 0) {
+			fprintf(stderr, "panto-perf: Wait failed: %s:%d\n", __func__, __LINE__);
+			exit(EXIT_FAILURE);
+		}
+		if (!WIFEXITED(status))
+			continue;
+
+		for (i = 0; i < p->task_count; i++) {
+			tr = &p->task_runs[i];
+			t = tr->task;
+			if (tr->real_pid == real_pid)
+				break;
+		}
+
+		assert(i < p->task_count);
+		tr->real_pid = -1;
+		p->forked_count--;
+	}
+}
+
+static const char * const spr_replay_usage[] = {
+	"perf sched spr-replay [<options>]",
+	NULL
+};
+
+static int parse_match_option(const struct option *opt, const char *str,
+			int unset __used)
+{
+	struct list_head *lh = (struct list_head *)opt->value;
+	struct match_list_entry *mle = NULL;
+	const char *s;
+	int ret;
+
+	mle = zalloc(sizeof(*mle));
+	if (mle == NULL)
+		goto error;
+
+	INIT_LIST_HEAD(&mle->node);
+	mle->pid = -1;
+
+	/* first check if everything is a digit (pid) */
+	for (s = str; *s != '\0'; s++)
+		if (!isdigit(*s))
+			break;
+	if (*s == '\0') {
+		mle->type = MT_PID;
+		mle->pid = atoi(str);
+		goto add;
+	}
+
+	/* name is either a name, or a pattern */
+	if (*str != '~') {
+		mle->type = MT_NAME;
+		mle->name = strdup(str);
+		if (mle->name == NULL)
+			goto error;
+		goto add;
+	}
+
+	/* it is a pattern after all of the form '~regex' */
+	ret = regcomp(&mle->regex, str + 1, REG_EXTENDED);
+	if (ret)
+		goto error;
+	mle->type = MT_REGEX;
+add:
+	list_add_tail(&mle->node, lh);
+	return 0;
+
+error:
+	free(mle);	/* NULL is properly handled */
+	return -1;
+}
+
+static const struct option spr_replay_options[] = {
+	OPT_CALLBACK('s', "select", &select_list, "select",
+		     "Number pid, 'name | ~regex' matches comm name",
+		     parse_match_option),
+	OPT_CALLBACK('x', "exclude", &exclude_list, "exclude",
+		     "Number pid, 'name | ~regex' matches comm name",
+		     parse_match_option),
+	OPT_UINTEGER('r', "repeat", &replay_repeat,
+		     "repeat the workload replay N times (-1: infinite)"),
+	OPT_INCR('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('p', "preserve-time", &preserve_time,
+		    "Preserve time (do not erase sleeps while in running state)"),
+	OPT_BOOLEAN('n', "dry-run", &dry_run,
+		    "do not execute"),
+	OPT_INCR('d', "debug", &debug,
+		    "be more verbose"),
+	OPT_BOOLEAN('g', "generate", &generate,
+		    "generate an spr program at stdout"),
+	OPT_STRING('f', "spr-filename", &spr_filename, "file",
+		    "spr file name (instead of trace)"),
+	OPT_BOOLEAN('l', "list", &spr_list,
+		    "list tasks & pids"),
+	OPT_BOOLEAN('L', "list-selected", &spr_select_list,
+		    "list selected tasks & pids"),
+	OPT_BOOLEAN('b', "bogoburn", &bogoburn,
+		    "burn time using a bogo-mips like busy loop"),
+	OPT_U64('B', "bogoloops", &bogoloops,
+		    "set bogoloops value directly without re-calculating"),
+	OPT_END()
+};
+
+static const char *task_state_str(unsigned int task_state)
+{
+	static const char * const task_state_array[] = {
+		"R (running)",		/*   0 */
+		"S (sleeping)",		/*   1 */
+		"D (disk sleep)",	/*   2 */
+		"T (stopped)",		/*   4 */
+		"t (tracing stop)",	/*   8 */
+		"Z (zombie)",		/*  16 */
+		"X (dead)",		/*  32 */
+		"x (dead)",		/*  64 */
+		"K (wakekill)",		/* 128 */
+		"W (waking)",		/* 256 */
+	};
+	const char * const *p = &task_state_array[0];
+	unsigned int state = task_state & 511;
+
+	while (state) {
+		p++;
+		state >>= 1;
+	}
+	return *p;
+}
+
+static void calculate_replay_start_time(void)
+{
+	struct sched_atom *atom;
+	struct task_desc *task;
+	unsigned long i;
+
+	replay_start_time = (u64)-1;	/* maximum possible */
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+
+		if (task->nr_events == 0)
+			continue;
+
+		atom = task->atoms[0];
+
+		if (atom->timestamp < replay_start_time)
+			replay_start_time = atom->timestamp;
+
+	}
+	if (replay_start_time == (u64)-1)
+		replay_start_time = 0;
+}
+
+static void calculate_replay_end_time(void)
+{
+	struct sched_atom *atom;
+	struct task_desc *task;
+	unsigned long i;
+	u64 ts_end;
+
+	replay_end_time = 0;	/* minimum */
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+
+		if (task->nr_events == 0)
+			continue;
+
+		atom = task->atoms[task->nr_events - 1];
+
+		ts_end = atom->timestamp;
+		if (atom->type == SCHED_EVENT_RUN)
+			ts_end += atom->duration;
+
+		if (ts_end > replay_end_time)
+			replay_end_time = ts_end;
+	}
+}
+
+static const char *sched_atom_str(const struct sched_atom *atom)
+{
+	static char buf[BUFSIZ];
+	char *s, *e;
+	unsigned int i;
+
+	if (replay_start_time == 0)
+		calculate_replay_start_time();
+
+	if (replay_end_time == 0)
+		calculate_replay_end_time();
+
+	s = buf;
+	e = &buf[ARRAY_SIZE(buf)];
+
+	snprintf(s, e - s, "\t%014" PRIu64 " ", atom->timestamp - replay_start_time);
+	e[-1] = '\0';
+	s += strlen(s);
+
+	switch (atom->type) {
+		case SCHED_EVENT_RUN:
+			snprintf(s, e - s, "%-10s%014" PRIu64, "RUN",
+				atom->duration);
+			e[-1] = '\0';
+			s += strlen(s);
+			break;
+		case SCHED_EVENT_SLEEP:
+			snprintf(s, e - s, "%-10s %s %d", "SLEEP",
+					task_state_str(atom->task_state),
+					atom->waker_count);
+			e[-1] = '\0';
+			s += strlen(s);
+			for (i = 0; i < atom->waker_count; i++) {
+				snprintf(s, e - s, " [%ld@%014" PRIu64 "]",
+					atom->wakers[i]->pid,
+					atom->waker_events[i]->timestamp - replay_start_time);
+				e[-1] = '\0';
+				s += strlen(s);
+			}
+			break;
+		case SCHED_EVENT_WAKEUP:
+			snprintf(s, e - s, "%-10s", "WAKEUP");
+			e[-1] = '\0';
+			s += strlen(s);
+			if (atom->wakee != NULL && atom->wakee_event != NULL) {
+				snprintf(s, e - s, " [%ld@%014" PRIu64 "]",
+					atom->wakee->pid,
+					atom->wakee_event->timestamp - replay_start_time);
+				e[-1] = '\0';
+				s += strlen(s);
+			}
+			break;
+		case SCHED_EVENT_MIGRATION:
+			snprintf(s, e - s, "%-10s", "MIGRATION");
+			e[-1] = '\0';
+			s += strlen(s);
+			break;
+		case SCHED_EVENT_EXIT:
+			snprintf(s, e - s, "%-10s", "EXIT");
+			e[-1] = '\0';
+			s += strlen(s);
+			break;
+		case SCHED_EVENT_FORK_PARENT:
+			snprintf(s, e - s, "%-10s %d", "FORK_PARENT", atom->pid);
+			e[-1] = '\0';
+			s += strlen(s);
+			break;
+		case SCHED_EVENT_FORK_CHILD:
+			snprintf(s, e - s, "%-10s %d", "FORK_CHILD", atom->pid);
+			e[-1] = '\0';
+			s += strlen(s);
+			break;
+		default:
+			/* GCC whines about missing default case without this one */
+			break;
+	}
+
+	if (debug > 2 && atom->msg) {
+		snprintf(s, e - s, " | %s", atom->msg);
+		e[-1] = '\0';
+		s += strlen(s);
+	}
+
+	return buf;
+}
+
+static void dump_sched_atom_task(struct task_desc *task)
+{
+	struct sched_atom *atom;
+	unsigned long j;
+	unsigned int exited;
+
+	exited = 0;
+	for (j = 0; j < task->nr_events && !exited; j++) {
+
+		atom = task->atoms[j];
+
+		printf("%s\n", sched_atom_str(atom));
+
+		/* exited = atom->type == SCHED_EVENT_EXIT; */
+	}
+}
+
+static const char *task_action_str(u64 local_ts, const struct task_action *ta)
+{
+	static char buf[BUFSIZ];
+	char *s, *e;
+
+	s = buf;
+	e = &buf[ARRAY_SIZE(buf)];
+
+	snprintf(s, e - s, "[%014" PRIu64 "] %-12s", local_ts, action_table[ta->action].name);
+	e[-1] = '\0';
+	s += strlen(s);
+
+	switch (ta->action) {
+		case TA_BURN:
+			snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.burn.nsecs);
+			break;
+		case TA_SLEEP:
+			snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.sleep.nsecs);
+			break;
+		case TA_SPAWN:
+			snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.spawn.nsecs);
+			break;
+		case TA_CLONE_PARENT:
+			snprintf(s, e - s, "%d", ta->u.clone_parent.child_pid);
+			break;
+		case TA_CLONE_CHILD:
+			snprintf(s, e - s, "%d", ta->u.clone_child.parent_pid);
+			break;
+		case TA_WAIT_ID:
+			snprintf(s, e - s, "%lu", ta->u.wait_id.id);
+			break;
+		case TA_SIGNAL_ID:
+			snprintf(s, e - s, "%lu", ta->u.signal_id.id);
+			break;
+		case TA_EXIT:
+		case TA_END:
+			*s = '\0';
+			break;
+		default:
+			/* GCC is whining about missing default case */
+			break;
+	}
+	e[-1] = '\0';
+	s += strlen(s);
+
+	return buf;
+}
+
+static struct task *generate_spr_program(struct task_desc *task, unsigned long *map_id_fwd, unsigned long *next_opt_id)
+{
+	struct sched_atom *atom, *atom_last;
+	unsigned long eventnr;
+	unsigned int exited;
+	unsigned int blocked;
+	unsigned int last_blocked;
+	unsigned int last_was_runnable;
+	unsigned int selected_wakers;
+	unsigned int exited_wakers;
+	struct task *t;
+	struct task_action *ta_alloc;
+	int ta_count, ta_max;
+	int task_state;
+	u64 replay_ts, replay_ts_new;
+	u64 local_ts;
+	u64 delta_ts;
+	u64 start_run_ts;
+	int pending_count;
+	struct task_action ta_work;
+	unsigned int i;
+	u64 burn_run_accum;
+
+	BUG_ON(task == NULL);
+
+	BUG_ON(task->nr_events == 0);
+
+	if (replay_start_time == 0)
+		calculate_replay_start_time();
+
+	if (replay_end_time == 0)
+		calculate_replay_end_time();
+
+	t = zalloc(sizeof(*t));
+	BUG_ON(t == NULL);
+	t->pid = task->pid;
+	t->name = strdup(task->comm);
+	BUG_ON(t->name == NULL);
+
+	ta_count = 0;
+	ta_max = 256;	/* starting point */
+	ta_alloc = zalloc(sizeof(*ta_alloc) * ta_max);
+	BUG_ON(ta_alloc == NULL);
+	t->actions = ta_alloc;
+
+#undef APPEND_TA_WORK
+#define APPEND_TA_WORK() \
+	do { \
+		if (ta_count >= ta_max) { \
+			ta_alloc = realloc(ta_alloc, ta_max * 2 * (sizeof(*t->actions))); \
+			BUG_ON(ta_alloc == NULL); \
+			ta_max *= 2; \
+			t->actions = ta_alloc; \
+		} \
+		ta_alloc[ta_count++] = ta_work; \
+		if (debug > 2) \
+			printf("\t\t%4d: GEN: %4d: [%012" PRIu64 "] %s\n", __LINE__, \
+				pending_count, start_run_ts, \
+					task_action_str(start_run_ts, &ta_work)); \
+	} while (0)
+
+#undef GET_OPTIMIZED_ID
+#define GET_OPTIMIZED_ID(x) \
+	({ \
+		unsigned long _x = (x); \
+		BUG_ON(_x >= (next_wake_id + 1)); \
+		if (map_id_fwd[_x] == 0) { \
+			map_id_fwd[_x] = ++(*next_opt_id); \
+			BUG_ON(*next_opt_id == 0); \
+		} \
+		map_id_fwd[_x]; \
+	})
+
+	exited = 0;
+	replay_ts = 0;
+	local_ts = 0;
+	task_state = -1;
+	atom  = NULL;
+	start_run_ts = 0;
+	pending_count = 0;
+	last_was_runnable = 0;
+	last_blocked = 0;
+	burn_run_accum = 0;
+
+	memset(&ta_work, 0, sizeof(ta_work));
+	ta_work.action = TA_END;
+
+	for (eventnr = 0; eventnr < task->nr_events && !exited; eventnr++) {
+
+		atom_last = atom;
+
+		atom = task->atoms[eventnr];
+
+		exited = atom->type == SCHED_EVENT_EXIT;
+
+		blocked = 0;
+		selected_wakers = 0;
+		exited_wakers = 0;
+
+		if (atom->type == SCHED_EVENT_SLEEP) {
+
+			/* by default we're blocked */
+			blocked = 1;
+
+			/* but if we don't preserve time and we're running... */
+			if (!preserve_time && (atom->task_state & 511) == 0) {
+				/* check for exited wakers */
+				selected_wakers = 0;
+				exited_wakers = 0;
+				for (i = 0; i < atom->waker_count; i++) {
+					if (atom->wakers[i]->selected) {
+						selected_wakers++;
+						if (atom->waker_events[i]->exited)
+							exited_wakers++;
+					}
+				}
+				blocked = selected_wakers > 0 && selected_wakers > exited_wakers;
+			}
+		}
+
+		replay_ts_new = atom->timestamp - replay_start_time;
+
+		/* first one */
+		if (eventnr == 0)
+			replay_ts = replay_ts_new;
+
+		if (last_was_runnable && !preserve_time)	/* we don't count the time we were runnable */
+			delta_ts = 0;
+		else
+			delta_ts = replay_ts_new - replay_ts;
+		last_was_runnable = 0;
+		replay_ts = replay_ts_new;
+
+		if (debug > 2)
+			printf("[%012" PRIu64 " %012" PRIu64 "] %s\n",
+				replay_ts, delta_ts, sched_atom_str(atom));
+
+		/* first one is special */
+		if (eventnr == 0) {
+			if (atom->type == SCHED_EVENT_EXIT) {
+				ta_work.action = TA_SLEEP;
+				ta_work.u.sleep.nsecs = 0;
+				start_run_ts = 0;
+				pending_count = 1;
+			} else if (atom->type != SCHED_EVENT_FORK_CHILD) {
+				ta_work.action = TA_BURN;
+				ta_work.u.burn.nsecs = replay_ts;
+				start_run_ts = 0;
+				pending_count = 1;
+				burn_run_accum = 0;
+			}
+		}
+
+		if (eventnr > 0 && pending_count > 0 && blocked != last_blocked) {
+
+			BUG_ON(ta_work.action != TA_BURN && ta_work.action != TA_SLEEP);
+
+			if (ta_work.action == TA_BURN) {
+				ta_work.u.burn.nsecs += burn_run_accum + delta_ts;
+				if (ta_work.u.burn.nsecs > 0)
+					APPEND_TA_WORK();
+			} else if (ta_work.action == TA_SLEEP) {
+				ta_work.u.sleep.nsecs += delta_ts;
+				if (ta_work.u.sleep.nsecs > 0)
+					APPEND_TA_WORK();
+			}
+			start_run_ts = local_ts;
+			pending_count = 0;
+		}
+
+		switch (atom->type) {
+			case SCHED_EVENT_RUN:
+				task_state = 0;
+				if (pending_count == 0) {
+					ta_work.action = TA_BURN;
+					if (atom->duration < delta_ts)
+						start_run_ts = local_ts - atom->duration;
+					else
+						start_run_ts = local_ts - delta_ts;
+				} else {
+					BUG_ON(ta_work.action != TA_BURN);
+				}
+				ta_work.u.burn.nsecs = 0;	/* always reset */
+				burn_run_accum += atom->duration;
+				pending_count++;
+				break;
+
+			case SCHED_EVENT_SLEEP:
+				task_state = atom->task_state & 511;
+
+				/* set when we were running, and switched out */
+				last_was_runnable = task_state == 0;
+
+				if (pending_count > 0) {
+					BUG_ON(ta_work.action != TA_BURN);
+					ta_work.u.burn.nsecs += delta_ts;
+					if (blocked) {
+						ta_work.u.burn.nsecs += burn_run_accum;
+						if (ta_work.u.burn.nsecs > 0)
+							APPEND_TA_WORK();
+						pending_count = 0;
+						burn_run_accum = 0;
+					}
+				}
+
+				/* if we're not blocked, we skip if we're still running */
+				if (!blocked)
+					break;
+
+				if (selected_wakers > 0 && selected_wakers > exited_wakers) {
+					ta_work.action = TA_WAIT_ID;
+					ta_work.u.wait_id.id = GET_OPTIMIZED_ID(atom->wake_id);
+					start_run_ts = local_ts;
+					APPEND_TA_WORK();
+					pending_count = 0;
+				} else {
+					ta_work.action = TA_SLEEP;
+					ta_work.u.sleep.nsecs = 0;
+					start_run_ts = local_ts;
+					pending_count = 1;
+				}
+
+				break;
+
+			case SCHED_EVENT_WAKEUP:
+				/* we were blocked? */
+				if (pending_count > 0 && last_blocked) {
+					BUG_ON(ta_work.action != TA_SLEEP);
+					ta_work.u.sleep.nsecs += delta_ts;
+					if (ta_work.u.sleep.nsecs > 0)
+						APPEND_TA_WORK();
+					start_run_ts = local_ts;
+					pending_count = 0;
+				}
+
+				/* no target; ignore */
+				if (atom->wakee == NULL || atom->wakee_event == NULL ||
+						!atom->wakee->selected) {
+					if (pending_count > 0) {
+						BUG_ON(ta_work.action != TA_BURN);
+						ta_work.u.burn.nsecs += delta_ts;
+					} else {
+						ta_work.action = TA_BURN;
+						ta_work.u.burn.nsecs = 0;
+						start_run_ts = local_ts;
+						burn_run_accum = 0;
+					}
+					pending_count++;
+					break;
+				}
+
+				/* target; need to finish the run */
+				if (pending_count > 0) {
+					BUG_ON(ta_work.action != TA_BURN);
+					ta_work.u.burn.nsecs += burn_run_accum + delta_ts;
+					if (ta_work.u.burn.nsecs > 0)
+						APPEND_TA_WORK();
+					pending_count = 0;
+				}
+
+				ta_work.action = TA_SIGNAL_ID;
+				ta_work.u.signal_id.id = GET_OPTIMIZED_ID(atom->wakee_event->wake_id);
+				start_run_ts = local_ts;
+				APPEND_TA_WORK();
+
+				/* and we're running */
+				ta_work.action = TA_BURN;
+				ta_work.u.burn.nsecs = 0;
+				start_run_ts = local_ts;
+				pending_count = 1;
+				burn_run_accum = 0;
+
+				break;
+
+			case SCHED_EVENT_MIGRATION:
+				break;
+			case SCHED_EVENT_EXIT:
+				if (pending_count > 0) {
+					if (ta_work.action == TA_BURN) {
+						ta_work.u.burn.nsecs += burn_run_accum + delta_ts;
+						if (ta_work.u.burn.nsecs > 0)
+							APPEND_TA_WORK();
+					} else if (ta_work.action == TA_SLEEP) {
+						ta_work.u.sleep.nsecs += delta_ts;
+						if (ta_work.u.sleep.nsecs > 0)
+							APPEND_TA_WORK();
+					} else {
+						BUG_ON(ta_work.action != TA_BURN &&
+						       ta_work.action != TA_SLEEP);
+					}
+					pending_count = 0;
+				}
+
+				ta_work.action = TA_EXIT;
+				ta_work.u.exit.ret = 0;
+				start_run_ts = local_ts;
+				APPEND_TA_WORK();
+				break;
+			case SCHED_EVENT_FORK_PARENT:
+				BUG_ON(atom->child == NULL);
+
+				if (pending_count > 0) {
+					BUG_ON(ta_work.action != TA_BURN);
+					ta_work.u.burn.nsecs += delta_ts;
+
+					if (atom->child->selected) {
+						ta_work.u.burn.nsecs += burn_run_accum;
+						if (ta_work.u.burn.nsecs > 0)
+							APPEND_TA_WORK();
+						pending_count = 0;
+					} else
+						pending_count++;
+				}
+
+				if (!atom->child->selected) {
+					if (verbose)
+						printf("Forking parent (%s/%ld) but child (%s/%ld) not selected; ignoring\n",
+							task->comm, task->pid, atom->child->comm, atom->child->pid);
+					break;
+				}
+
+				ta_work.action = TA_CLONE_PARENT;
+				ta_work.u.clone_parent.child_pid = atom->pid;
+				start_run_ts = local_ts;
+				APPEND_TA_WORK();
+
+				/* and we're running */
+				ta_work.action = TA_BURN;
+				ta_work.u.burn.nsecs = 0;
+				start_run_ts = local_ts;
+				pending_count = 1;
+				burn_run_accum = 0;
+
+				task_state = 0;	/* running */
+
+				break;
+			case SCHED_EVENT_FORK_CHILD:
+				BUG_ON(eventnr != 0);	/* this _must_ be the first one */
+				BUG_ON(atom->parent == NULL);
+
+				if (!atom->parent->selected) {
+					if (verbose)
+						printf("Forking child (%s/%ld) but parent (%s/%ld) not selected; converting to run\n",
+							task->comm, task->pid, atom->parent->comm, atom->parent->pid);
+
+					task_state = 0;
+					ta_work.action = TA_BURN;
+					ta_work.u.burn.nsecs = 0;
+					start_run_ts = local_ts;
+					pending_count = 1;
+					burn_run_accum = 0;
+					break;
+				}
+
+				ta_work.action = TA_CLONE_CHILD;
+				ta_work.u.clone_child.parent_pid = atom->pid;
+				start_run_ts = local_ts;
+				APPEND_TA_WORK();
+
+				/* and we're running */
+				ta_work.action = TA_BURN;
+				ta_work.u.burn.nsecs = 0;
+				start_run_ts = local_ts;
+				pending_count = 1;
+				burn_run_accum = 0;
+
+				task_state = 0;	/* running */
+
+				break;
+			default:
+				break;
+		}
+
+		if (preserve_time || !last_was_runnable)
+			local_ts += delta_ts;
+
+		last_blocked = blocked;
+	}
+
+	/* stop any runs */
+	if (pending_count > 0) {
+
+		BUG_ON(ta_work.action != TA_BURN && ta_work.action != TA_SLEEP);
+
+		/* if the process was running extend till end */
+		delta_ts = !exited ? (replay_end_time - (replay_ts + replay_start_time)) : 0;
+
+		if (ta_work.action == TA_BURN) {
+			ta_work.u.burn.nsecs += burn_run_accum + delta_ts;
+			if (ta_work.u.burn.nsecs > 0)
+				APPEND_TA_WORK();
+		} else if (ta_work.action == TA_SLEEP) {
+			ta_work.u.sleep.nsecs += delta_ts;
+			if (ta_work.u.sleep.nsecs > 0)
+				APPEND_TA_WORK();
+		}
+		pending_count = 0;
+	}
+
+	ta_work.action = TA_END;
+	ta_work.u.exit.ret = 0;
+	start_run_ts = local_ts;
+	APPEND_TA_WORK();
+
+	return t;
+}
+
+static int read_spr_program(const char *file, struct task ***ttt)
+{
+	FILE *fp;
+	char buf[BUFSIZ];
+	char orig_buf[BUFSIZ];
+	char *name;
+	struct task *t, **tt;
+	struct task_action ta_work, *ta_alloc;
+	int task_count, task_max;
+	int ta_count, ta_max;
+	char *s, *e, *start, *end;
+	int line, pid;
+
+#undef APPEND_TASK
+#define APPEND_TASK() \
+	do { \
+		if (task_count >= task_max) { \
+			tt = realloc(tt, (task_max + 32) * (sizeof(*tt))); \
+			BUG_ON(tt == NULL); \
+			task_max += 32; \
+		} \
+		tt[task_count++] = t; \
+	} while (0)
+
+#undef APPEND_TA_WORK
+#define APPEND_TA_WORK() \
+	do { \
+		if (ta_count >= ta_max) { \
+			ta_alloc = realloc(ta_alloc, (ta_max + 4096) * (sizeof(*ta_alloc))); \
+			BUG_ON(ta_alloc == NULL); \
+			ta_max += 4096; \
+		} \
+		ta_alloc[ta_count++] = ta_work; \
+	} while (0)
+
+
+	fp = fopen(file, "ra");
+	if (fp == NULL) {
+		fprintf(stderr, "Could not open file '%s'\n", file);
+		return -1;
+	}
+
+	task_count = 0;
+	task_max = 0;
+	tt = NULL;
+	ta_count = 0;
+	ta_max = 0;
+	ta_alloc = NULL;
+	pid = -1;
+	name = NULL;
+
+	t = NULL;
+	line = 0;
+	while (fgets(buf, sizeof(buf) - 1, fp) != NULL) {
+		line++;
+
+		buf[sizeof(buf) - 1] = '\0';
+
+		/* remove trailing newline */
+		s = strrchr(buf, '\n');
+		if (s != NULL)
+			*s = '\0';
+
+		strcpy(orig_buf, buf);
+
+		/* remove comments */
+		s = strchr(buf, '#');
+		if (s != NULL)
+			*s = '\0';
+
+		/* remove trailing spaces */
+		s = buf + strlen(buf) - 1;
+		while (s > buf && isspace(*s))
+			*s-- = '\0';
+
+		/* skip over spaces in the front */
+		s = buf;
+		while (isspace(*s))
+			s++;
+		start = s;
+		end = s + strlen(s);
+
+		/* empty line */
+		if (start >= end - 1)
+			continue;
+
+		/* waiting for task */
+		if (t == NULL) {
+			if (*start != '[' || end[-1] != ']')
+				goto syntax_error;
+			s = start + 1;
+			e = strrchr(s, '/');
+			if (e == NULL)
+				goto syntax_error;
+			if (e - s < 1)
+				goto syntax_error;
+
+			name = malloc(e - s + 1);
+			BUG_ON(name == NULL);
+
+			memcpy(name, s, e - s);
+			name[e - s] = '\0';
+
+			pid = atoi(e + 1);
+			if (pid < 0)
+				goto syntax_error;
+
+			ta_count = 0;
+			ta_max = 0;
+			ta_alloc = NULL;
+
+			/* allocate but don't do anything with it */
+			t = malloc(sizeof(*t));
+			BUG_ON(t == NULL);
+
+		} else {
+			s = start;
+			e = start;
+			while (!isspace(*e))
+				e++;
+			*e++ = '\0';
+			if (strcmp(s, "burn") == 0) {
+				ta_work.action = TA_BURN;
+				ta_work.u.burn.nsecs = strtoull(e, NULL, 10);
+			} else if (strcmp(s, "sleep") == 0) {
+				ta_work.action = TA_SLEEP;
+				ta_work.u.sleep.nsecs = strtoull(e, NULL, 10);
+			} else if (strcmp(s, "spawn") == 0) {
+				ta_work.action = TA_SPAWN;
+				ta_work.u.spawn.nsecs = strtoull(e, NULL, 10);
+			} else if (strcmp(s, "clone-parent") == 0) {
+				ta_work.action = TA_CLONE_PARENT;
+				ta_work.u.clone_parent.child_pid = (int)strtoul(e, NULL, 10);
+			} else if (strcmp(s, "clone-child") == 0) {
+				ta_work.action = TA_CLONE_CHILD;
+				ta_work.u.clone_child.parent_pid = (int)strtoul(e, NULL, 10);
+			} else if (strcmp(s, "wait-id") == 0) {
+				ta_work.action = TA_WAIT_ID;
+				ta_work.u.wait_id.id = strtoul(e, NULL, 10);
+			} else if (strcmp(s, "signal-id") == 0) {
+				ta_work.action = TA_SIGNAL_ID;
+				ta_work.u.signal_id.id = strtoul(e, NULL, 10);
+			} else if (strcmp(s, "exit") == 0) {
+				ta_work.action = TA_EXIT;
+				ta_work.u.exit.ret = (int)strtoul(e, NULL, 10);
+			} else if (strcmp(s, "end") == 0) {
+				ta_work.action = TA_END;
+			} else
+				goto syntax_error;
+
+			APPEND_TA_WORK();
+
+			if (ta_work.action == TA_END) {
+				BUG_ON(t == NULL);
+				BUG_ON(name == NULL);
+				BUG_ON(pid < 0);
+
+				if (ta_count == 0)
+					goto syntax_error;
+
+				t->name = name;
+				t->pid = pid;
+				t->actions = ta_alloc;
+
+				APPEND_TASK();
+				t = NULL;
+			}
+		}
+	}
+	t = NULL;
+	APPEND_TASK();
+
+	fclose(fp);
+
+	/* no program */
+	if (tt == NULL)
+		return -1;
+
+	*ttt = tt;
+	return 0;
+
+syntax_error:
+	fprintf(stderr, "syntax error at line %d: %s\n", line, orig_buf);
+	*ttt = NULL;
+	fclose(fp);
+	return -1;
+}
+
+static void __cmd_spr_replay(void)
+{
+	struct task_desc *task;
+	struct task **tt;
+	const struct task * const *task_array;
+	int matched, excluded;
+	unsigned long i, j;
+	int selected_task_count;
+	struct playback *p;
+	unsigned long *map_id_fwd;
+	unsigned long next_opt_id;
+	struct sched_atom *atom;
+	u64 run_nsecs;
+	int ret;
+
+	/* mark that we're not the standard replay */
+	spr_replay = true;
+
+	if (!dry_run && !(spr_list || spr_select_list)) {
+		calibrate_run_measurement_overhead();
+		calibrate_sleep_measurement_overhead();
+
+		test_calibrations();
+	}
+
+	if (spr_filename == NULL) {
+
+		read_events(true, NULL);
+
+		if (debug > 0) {
+			printf("nr_run_events:        %ld\n", nr_run_events);
+			printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+			printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+			if (targetless_wakeups)
+				printf("target-less wakeups:  %ld\n", targetless_wakeups);
+			if (multitarget_wakeups)
+				printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+			if (nr_run_events_optimized)
+				printf("run atoms optimized: %ld\n",
+					nr_run_events_optimized);
+		}
+
+		calculate_replay_start_time();
+		calculate_replay_end_time();
+
+		selected_task_count = 0;
+		for (i = 0; i < nr_tasks; i++) {
+			task = tasks[i];
+
+			matched = match_task(&select_list, task->comm, task->pid, 1);
+			excluded = match_task(&exclude_list, task->comm, task->pid, 0);
+
+			task->selected = matched && !excluded;
+			selected_task_count += task->selected;
+		}
+
+		tt = zalloc((selected_task_count + 1) * sizeof(*tt));
+		BUG_ON(tt == NULL);
+
+		if (spr_list || spr_select_list) {
+			for (j = 0; j < nr_tasks; j++) {
+				task = tasks[j];
+
+				/* task with no events, doesn't show up */
+				if (task->nr_events == 0)
+					continue;
+
+				/* we have a match list */
+				if (spr_select_list && !task->selected)
+					continue;
+
+				run_nsecs = 0;
+				for (i = 0; i < task->nr_events; i++) {
+					atom = task->atoms[i];
+					if (atom->type == SCHED_EVENT_EXIT)
+						break;
+					if (atom->type == SCHED_EVENT_RUN)
+						run_nsecs += atom->duration;
+				}
+
+				printf("[%s/%ld] %c R:%" PRIu64 "\n",
+					task->comm, task->pid,
+					task->selected ? 'S' : '-',
+					run_nsecs);
+			}
+		}
+
+		if (debug > 1) {
+			printf("Dump of scheduling atoms\n");
+			for (i = 0; i < nr_tasks; i++) {
+				task = tasks[i];
+
+				if (task->nr_events == 0 || !task->selected)
+					continue;
+
+				printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
+					task->nr, task->comm, task->pid, task->nr_events);
+
+				dump_sched_atom_task(task);
+				printf("\n");
+			}
+			printf("\n");
+		}
+
+		/* map of a wake id to an optimized wake id */
+		map_id_fwd = zalloc((next_wake_id + 1) * sizeof(*map_id_fwd));
+		BUG_ON(map_id_fwd == NULL);
+
+		next_opt_id = 0;
+
+		if (verbose)
+			printf("Generating replay program\n");
+
+		for (i = 0, j = 0; i < nr_tasks; i++) {
+			task = tasks[i];
+
+			if (task->nr_events == 0 || !task->selected)
+				continue;
+
+			if (verbose)
+				printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
+					task->nr, task->comm, task->pid, task->nr_events);
+
+			tt[j] = generate_spr_program(task, map_id_fwd, &next_opt_id);
+			BUG_ON(tt[j] == NULL);
+
+			if (debug > 1) {
+				dump_task(tt[j]);
+				printf("\n");
+			}
+
+			j++;
+		}
+
+		if (generate) {
+			for (i = 0; tt[i] != NULL; i++)
+				generate_task(tt[i]);
+		}
+
+		free(map_id_fwd);
+
+	} else {
+		ret = read_spr_program(spr_filename, &tt);
+		BUG_ON(ret != 0);
+
+		if (generate) {
+			for (i = 0; tt[i] != NULL; i++)
+				generate_task(tt[i]);
+		}
+
+	}
+
+	if (!dry_run) {
+
+		if (bogoburn) {
+
+			if (bogoloops == 0)
+				calculate_bogoloops_value();
+
+			if (debug > 0)
+				printf("bogoloops at %" PRIu64 "\n", bogoloops);
+		}
+
+		task_array = (void *)tt;
+
+		p = playback_create(task_array);
+		BUG_ON(p == NULL);
+
+		playback_set_debug_level(p, debug);
+
+		if (verbose)
+			printf("Running...\n");
+		playback_run(p);
+
+		if (verbose)
+			printf("Done...\n");
+
+		playback_destroy(p);
+	}
+
+	/* free */
+	for (i = 0; tt[i] != NULL; i++) {
+		free((void *)tt[i]->actions);
+		free((void *)tt[i]->name);
+		free(tt[i]);
+	}
+	free(tt);
+}
+
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
    argc = parse_options(argc, argv, sched_options, sched_usage,
@@ -1920,6 +4224,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
    			usage_with_options(replay_usage, replay_options);
    	}
    	__cmd_replay();
+	} else if (!strncmp(argv[0], "spr-rep", 7)) {
+		trace_handler = &replay_ops;
+		if (argc) {
+			argc = parse_options(argc, argv, spr_replay_options, spr_replay_usage, 0);
+			if (argc)
+				usage_with_options(spr_replay_usage, spr_replay_options);
+		}
+		__cmd_spr_replay();
    } else {
    	usage_with_options(sched_usage, sched_options);
    }
-- 
1.7.1