perf kmem stat: Track memory freed

Track freed memory as well as allocations and show the net in the
summary.

Committer notes:

Testing it:

  # perf kmem record usleep 1
  [ perf record: Woken up 0 times to write data ]
  [ perf record: Captured and wrote 1.626 MB perf.data (4208 samples) ]
  [root@jouet ~]# perf kmem stat --slab

  SUMMARY (SLAB allocator)
  ========================
  Total bytes requested: 234,011
  Total bytes allocated: 234,504
  Total bytes freed:     213,328                                 <------
  Net total bytes allocated: 21,176
  Total bytes wasted on internal fragmentation: 493
  Internal fragmentation: 0.210231%
  Cross CPU allocations: 4/1,963
  #

Signed-off-by: David Ahern <dsahern@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1480110133-37039-1-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index d426dcb..7fd6f1e 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -49,6 +49,7 @@ struct alloc_stat {
 	u64	ptr;
 	u64	bytes_req;
 	u64	bytes_alloc;
+	u64	last_alloc;
 	u32	hit;
 	u32	pingpong;
 
@@ -62,7 +63,7 @@ static struct rb_root root_alloc_sorted;
 static struct rb_root root_caller_stat;
 static struct rb_root root_caller_sorted;
 
-static unsigned long total_requested, total_allocated;
+static unsigned long total_requested, total_allocated, total_freed;
 static unsigned long nr_allocs, nr_cross_allocs;
 
 static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
@@ -105,6 +106,8 @@ static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
 	}
 	data->call_site = call_site;
 	data->alloc_cpu = cpu;
+	data->last_alloc = bytes_alloc;
+
 	return 0;
 }
 
@@ -223,6 +226,8 @@ static int perf_evsel__process_free_event(struct perf_evsel *evsel,
 	if (!s_alloc)
 		return 0;
 
+	total_freed += s_alloc->last_alloc;
+
 	if ((short)sample->cpu != s_alloc->alloc_cpu) {
 		s_alloc->pingpong++;
 
@@ -1128,6 +1133,11 @@ static void print_slab_summary(void)
 	printf("\n========================\n");
 	printf("Total bytes requested: %'lu\n", total_requested);
 	printf("Total bytes allocated: %'lu\n", total_allocated);
+	printf("Total bytes freed:     %'lu\n", total_freed);
+	if (total_allocated > total_freed) {
+		printf("Net total bytes allocated: %'lu\n",
+		total_allocated - total_freed);
+	}
 	printf("Total bytes wasted on internal fragmentation: %'lu\n",
 	       total_allocated - total_requested);
 	printf("Internal fragmentation: %f%%\n",