/*
 *  kernprof.c - control kernel profiling
 *
 *  Copyright (C) 1999, 2000, 2001 SGI
 *
 *  Written by Dimitris Michailidis (dimitris@engr.sgi.com)
 *  Modifications by John Hawkes (hawkes@engr.sgi.com)
 *  Contributions from Niels Christiansen (nchr@us.ibm.com)
 *  Contributions from Ethan Solomita (ethan@cs.columbia.edu)
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#define __SMP__
#define CONFIG_MCOUNT 1

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/gmon.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>
#include <sched.h>
#include <linux/profile.h>
#include <linux/threads.h>
#include <linux/module.h>
#include "gmon_out.h"

/* Version string printed at the top of the usage message. */
#define VERSION "1.4"

/* Default value of sleep_msec: pause, in milliseconds, between trace reads. */
#define SLEEP_MSEC 200
/* Symbol map used by the PC-sample report when -m is not given. */
#define DEFAULT_MAP "/usr/src/linux/System.map"
/* NOTE(review): DEFAULT_KERNEL is not referenced anywhere in the visible file. */
#define DEFAULT_KERNEL "/usr/src/linux/vmlinux"
/* Output file name used when -o is not given. */
#define defaultGprofOutFile "gmon.out"

/*
 * Applied to the gmon record structures declared on the stack: their fields
 * are stored through int/pointer casts, so the object is given at least the
 * alignment of a pointer.
 */
#define PTR_ALIGNED  __attribute__ ((aligned (__alignof__ (char *))))

typedef unsigned long address_t;  /* What user space considers an address */

/*
 * kaddress_t is what the kernel considers an address.  It may differ from
 * address_t if kernel- and user-space addresses are of different size.
 *
 * SCANF_PREFIX is the extra length modifier that is pasted in front of "lx"
 * in scanf/printf format strings so that the conversion matches kaddress_t
 * ("llx" where kaddress_t is unsigned long long, plain "lx" otherwise).
 */
#if defined(__mips__) || defined(__sparc__)
typedef unsigned long long kaddress_t;
#define SCANF_PREFIX "l"
#else
typedef address_t kaddress_t;
#define SCANF_PREFIX ""
#endif

/*
 * One destination entry of a call-graph arc.  This must match the kernel's
 * definition.
 *
 * Entries are kept in an array and chained through 'link'; a "froms" table
 * holds, per source bucket, the index of the first entry of its chain.
 * Index 0 terminates a chain.
 */
struct cg_arc_dest {
   kaddress_t address;     /* address recorded for this arc's destination */
   int count;              /* accumulated count for this arc */
   unsigned short link;    /* index of the next entry in the chain, 0 = end */
   unsigned short pad;     /* presumably padding to match the kernel layout */
};

/*
 * Type for CPU bitmaps in the kernel.  Presently the same as kaddress_t.
 * Will need to be changed once we start supporting really large CPU counts.
 */
typedef kaddress_t cpu_map_t;

/*
 * Layout of the profiler buckets, filled in by the PROF_GET_MAPPING ioctl
 * (see main()).  The fields read in this file are kernel_start, kernel_end,
 * kernel_buckets, nr_cpus, cg_from_size, cg_to_offset and cg_to_size.
 */
prof_mem_map_t  memory_map;

/*
 * ioctl request numbers not taken from <linux/profile.h>.
 * These should match the kernel's definition.
 */
#define PROF_SET_ENABLE_MAP     _IOW(0xAF, 13, cpu_map_t)
#define PROF_GET_ENABLE_MAP     _IOR(0xAF, 14, cpu_map_t)
#define PROF_GET_MAPPING        _IOR(0xAF, 15, long)

char *prgname;                          /* argv[0]; prefix for diagnostics */
int hasSymlib = 0;                      /* NOTE(review): unused in the visible file */
/*
 * Set to 1 by the SIGINT handler.  It is written asynchronously from a
 * signal handler and polled by the collection loop, so it must be a
 * volatile sig_atomic_t rather than a plain int.
 */
volatile sig_atomic_t recv_ctlC = 0;    /* true if we got ^C */
unsigned long sleep_msec = SLEEP_MSEC;  /* sleep interval */
int ctl_fd;                             /* descriptor of /dev/profile */

void usage()
{
   fprintf(stderr,
	   "Kernprof version " VERSION "\n"
	   "Usage: %s [options]\n"
	   "\t -a <event>        event for the PMC domain\n"
	   "\t -b                start profiling\n"
	   "\t -c <cpu> | all    enable profiling on CPU <cpu>\n"
	   "\t -d [time | pmc]   select profiling domain\n"
	   "\t -e                stop profiling\n"
	   "\t -f <sample_freq>  PC sampling frequency\n"
	   "\t -g                write call graph data to <outfile> (\"-o <outfile>\")\n"
	   "\t -h                this message\n"
	   "\t -i                show PC-sample or call-count profile\n"
	   "\t -m <mapfile>      (default: \"%s\")\n"
	   "\t -o <outfile>      (default: \"%s\")\n"
	   "\t -p <pid>          specify a single pid to be profiled\n"
	   "\t -r                reset profiling data structures\n"
	   "\t -s                display profiling status\n"
	   "\t -t <mode>         select profiling mode\n"
	   "\t -v                verbose PC sample output\n"
	   "\t -w <msec>         sleep interval for trace collection (default: %u)\n\n"
	   "<mode> can be one of\n"
	   "\t pc                PC sampling\n"
	   "\t cc                call count\n"
	   "\t cg                call graph\n"
	   "\t acg               annotated call graph\n"
	   "\t ct                call backtrace\n",
	   prgname, DEFAULT_MAP, defaultGprofOutFile, SLEEP_MSEC);
   exit(1);
}

/*
 * Print s followed by the description of the current errno value (via
 * perror) and terminate the program with exit status 1.  Never returns.
 */
void err_exit(const char *s) 
{
   perror(s);
   exit(1);
}

/*
 * Map a profiling mode value to a human-readable name.
 * Returns a pointer to a string literal; "unknown" for unrecognised values.
 */
char *decode_prof_mode(int mode)
{
   if (mode == PROF_MODE_PC_SAMPLING)
      return "PC sampling";
   if (mode == PROF_MODE_CALL_GRAPH)
      return "call graph";
   /* call graph combined with PC sampling */
   if (mode == (PROF_MODE_CALL_GRAPH | PROF_MODE_PC_SAMPLING))
      return "annotated call graph";
   if (mode == PROF_MODE_BACKTRACE)
      return "call backtrace";
   if (mode == PROF_MODE_CALL_COUNT)
      return "call count";
   if (mode == PROF_MODE_SCHEDULER_CALL_GRAPH)
      return "scheduler call graph";

   return "unknown";
}

/*
 * Map a profiling domain value to a human-readable name.
 * Returns a pointer to a string literal; "unknown" for unrecognised values.
 */
char *decode_prof_domain(int domain)
{
   if (domain == PROF_DOMAIN_TIME)
      return "time";
   if (domain == PROF_DOMAIN_PERFCTR)
      return "PMC";

   return "unknown";
}

/*
 * Issue an ioctl on the profiling control descriptor (ctl_fd).
 * Any non-zero result is fatal: the error is reported and the program exits.
 */
void prof_ioctl(int request, unsigned long arg)
{
   int rc = ioctl(ctl_fd, request, arg);

   if (rc != 0)
      err_exit("/dev/profile: ioctl");
}

/*
 * Set the sampling frequency.  If the ioctl fails, report the frequency as
 * out of range and exit with status 1.
 */
void set_prof_freq(unsigned int freq)
{
   if (ioctl(ctl_fd, PROF_SET_SAMPLE_FREQ, freq) == 0)
      return;

   fprintf(stderr, "%s: frequency %u out of range\n", prgname, freq);
   exit(1);
}

/*
 * Select the profiling mode.  If the ioctl fails, report that the kernel
 * does not support the mode and exit with status 1.
 */
void set_prof_mode(unsigned int mode)
{
   if (ioctl(ctl_fd, PROF_SET_MODE, mode) == 0)
      return;

   fprintf(stderr, "%s: kernel does not support %s mode\n", prgname,
           decode_prof_mode(mode));
   exit(1);
}

/*
 * Restrict profiling to a single pid.  If the ioctl fails, print the
 * "cannot set pid while profiling" diagnostic and exit with status 1.
 */
void set_prof_pid(unsigned int pid)
{
   if (ioctl(ctl_fd, PROF_SET_PID, pid) == 0)
      return;

   fprintf(stderr, "%s: cannot set pid while profiling\n", prgname);
   exit(1);
}

/*
 * Select the profiling domain.  If the ioctl fails, report that the kernel
 * does not support the domain and exit with status 1.
 */
void set_prof_domain(unsigned int domain)
{
   if (ioctl(ctl_fd, PROF_SET_DOMAIN, domain) == 0)
      return;

   fprintf(stderr, "%s: kernel does not support %s domain\n", prgname,
           decode_prof_domain(domain));
   exit(1);
}

/*
 * User can specify the lower 20 bits that will be loaded into PerfEvtSel0.
 * IA32 specific.
 *
 * Values above 0xFFFFF are rejected.  A value below 0x100 (only the low
 * eight bits given) has 0x30000 OR-ed in before it is passed to the kernel
 * -- presumably the default privilege-level enable bits; confirm against
 * the PerfEvtSel layout.
 */
void set_PMC_event(unsigned int event)
{
   const unsigned int largest_event = 0xFFFFF;

   if (event > largest_event) {
      fprintf(stderr, "%s: illegal event %#x\n", prgname, event);
      exit(1);
   }

   if (event < 0x100)
      event |= 0x30000;

   if (ioctl(ctl_fd, PROF_SET_PERFCTR_EVENT, (unsigned long) event) == 0)
      return;

   fprintf(stderr, "%s: %#x: illegal event or PMC domain not supported\n",
           prgname, event);
   exit(1);
}

/*
 * Query an int-sized profiling attribute.  'entry' is the ioctl request;
 * the kernel stores the answer through the pointer passed as argument.
 * Exits (via prof_ioctl) if the ioctl fails.
 */
int get_prof_info(int entry)
{
   int result;

   prof_ioctl(entry, (unsigned long) &result);

   return result;
}

/*
 * Same as get_prof_info() but for attributes of type cpu_map_t (CPU
 * bitmaps).  Exits (via prof_ioctl) if the ioctl fails.
 */
cpu_map_t get_prof_info_bitmap(int entry)
{
   cpu_map_t bitmap;

   prof_ioctl(entry, (unsigned long) &bitmap);

   return bitmap;
}

void show_prof_status(void)
{
   int domain = get_prof_info(PROF_GET_DOMAIN);
   int event, ret;
   
   printf("profiling mode: %s, domain: %s, status: %s, cpu map: %#"
	  SCANF_PREFIX "lx, pid: %d\n",
	  decode_prof_mode(get_prof_info(PROF_GET_MODE)),
	  decode_prof_domain(domain),
	  get_prof_info(PROF_GET_ON_OFF_STATE) ? "on" : "off",
	  get_prof_info_bitmap(PROF_GET_ENABLE_MAP),
	  get_prof_info(PROF_GET_PID));
   printf("PC sampling resolution: %i bytes, sampling frequency: %i %s\n",
	  get_prof_info(PROF_GET_PC_RES),
	  get_prof_info(PROF_GET_SAMPLE_FREQ),
	  domain == PROF_DOMAIN_TIME ? "Hz" : "events");
   ret = ioctl(ctl_fd, PROF_GET_PERFCTR_EVENT, (unsigned long) &event);
   if (!ret)
      printf("PMC event: %#x\n", event);
   else
      printf("PMC domain not supported\n");
}

/*
 * Read the contents of a file into a dynamically allocated buffer.
 * Also return the size of the buffer.
 *
 * name  path of the file to read
 * lenp  if non-NULL, receives the number of bytes read
 *
 * The size is determined by seeking to the end of the file.  Returns a
 * buffer the caller must free().  Any failure (open, seek, allocation,
 * read error, or end-of-file before the expected size) is fatal.
 */
char *read_file(const char *name, size_t *lenp)
{
   char *buf;
   int fd;
   off_t end;
   size_t len, got = 0;
   
   if ((fd = open(name, O_RDONLY)) < 0 || (end = lseek(fd, 0, SEEK_END)) < 0 ||
       lseek(fd, 0, SEEK_SET) < 0)
      err_exit(name);

   len = (size_t) end;
   /* malloc(0) may return NULL without that being an error */
   if ((buf = malloc(len ? len : 1)) == NULL)
      err_exit("malloc");

   /*
    * read() may return fewer bytes than requested, or be interrupted by a
    * signal, so keep reading until the whole file has been collected.
    */
   while (got < len) {
      ssize_t n = read(fd, buf + got, len - got);

      if (n < 0 && errno == EINTR)
         continue;
      if (n <= 0) {
         fprintf(stderr, "Can't read %s; did you run kernprof -b?\n", name);
         err_exit(name);
      }
      got += (size_t) n;
   }

   close(fd);
   if (lenp) *lenp = len;
   return buf;
}

/* Get a symbol's address from a file that obeys the format of /proc/ksyms */
address_t get_symbol_address(FILE *fp, const char *name)
{
   char mapline[256], sym_name[256];
   kaddress_t addr;
   
   while (fgets(mapline, sizeof mapline, fp)) {
      if (sscanf(mapline, "%" SCANF_PREFIX "lx %s", &addr, sym_name) != 2) {
	 fprintf(stderr,"%s: corrupt map file\n", prgname);
	 exit(1);
      }
      if (!strncmp(sym_name, name, strlen(name)))
	 return (address_t) addr;
   }
   return 0;
}

/*
 * Print a per-symbol summary of the kernel's PC samples.
 *
 * mapFile  System.map-style file: "<hex address> <type> <name>" per line
 * outFile  file to write the report to, or NULL for stdout
 * verbose  non-zero to also list every non-empty bucket of each symbol
 *
 * The sample buckets are read from /proc/profile/PC_samples.  Map entries
 * are skipped until "_stext", which gives the address of bucket 0.  For each
 * following symbol the buckets between the previous symbol and this one are
 * summed and attributed to the previous symbol.  Processing stops at
 * "_etext" or at the first entry whose type is '?'.
 */
void output_pc_profile(const char *mapFile, const char *outFile, int verbose)
{
   unsigned int step, index;
   unsigned long total_ticks = 0;
   address_t addr0 = 0;
   address_t addr = 0, next_addr;    /* current and next symbol addresses */
   char name[256] = "", next_name[256]; /* current and next symbol names */
   char mode[8], mapline[256];
   FILE *map, *out;
   PC_sample_count_t *samples;
   size_t samples_len, nbuckets;

   samples = (PC_sample_count_t *)read_file("/proc/profile/PC_samples",
                                            &samples_len);
   nbuckets = samples_len / sizeof(*samples);
   if ((map = fopen(mapFile, "r")) == NULL)
      err_exit(mapFile);
   if (outFile == NULL)
      out = stdout;
   else if ((out = fopen(outFile, "w")) == NULL)
      err_exit(outFile);
   
   step = get_prof_info(PROF_GET_PC_RES);
   if (step == 0) {
      /* would otherwise divide by zero below */
      fprintf(stderr, "%s: kernel reports a PC resolution of 0\n", prgname);
      exit(1);
   }

   index = 0;
   while (fgets(mapline, sizeof mapline, map)) {
      unsigned int start_idx, end_idx, tot_samples = 0;
      address_t span;

      /*
       * next_addr is an unsigned long, so the conversion must be %lx; the
       * field widths keep the string conversions inside their buffers.
       */
      if (sscanf(mapline, "%lx %7s %255s", &next_addr, mode, next_name) != 3) {
         fprintf(stderr, "%s: %s: corrupt map file\n", prgname, mapFile);
         exit(1);
      }

      if (addr0 == 0) {                               /* bootstrapping */
         if (strcmp(next_name, "_stext")) continue;
         addr0 = addr = next_addr;
         strcpy(name, next_name);
         continue;
      }

      start_idx = index;
      /* never walk past the buckets that were actually read */
      span = (next_addr - addr0) / step;
      end_idx = span > nbuckets ? (unsigned int) nbuckets : (unsigned int) span;
      while (index < end_idx) {
         tot_samples += samples[index++];
      }
      if (tot_samples > 0) {
         total_ticks += tot_samples;
         fprintf(out, "%s [%08lx]: %u\n", name, addr, tot_samples);

         /* Process the bins a second time to report samples per address */
         if (verbose)
            for (index = start_idx; index < end_idx; ++index)
               if (samples[index] != 0)
                  fprintf(out, "    %08lx\t%u\n", addr0 + index * step,
                          samples[index]);
      }

      addr = next_addr;
      strcpy(name, next_name);
      if (*mode == '?' || !strcmp(name, "_etext")) /* only text is profiled */
         break;
   }
  
   if (total_ticks)
      fprintf(out, "%s          %lu\n", "TOTAL_SAMPLES", total_ticks);
   if (outFile)
      fclose(out);
   fclose(map);
   free(samples);
}

/*
 * Fill in the prof_rate, dimen and dimen_abbrev fields of a gmon header
 * record 'hdr' from the kernel's current sampling frequency and domain.
 * In the time domain the rate is stored as is with dimension "seconds";
 * otherwise the rate is stored negated and the dimension names the event.
 *
 * Wrapped in do { } while (0) so that "SET_PROF_RATE(h);" is a single
 * statement in every context.  dimen is cleared first so that no
 * uninitialised stack bytes end up in the output file.
 */
#define SET_PROF_RATE(hdr) \
do { \
   int rate_ = get_prof_info(PROF_GET_SAMPLE_FREQ); \
   memset((hdr).dimen, 0, sizeof (hdr).dimen); \
   if (get_prof_info(PROF_GET_DOMAIN) == PROF_DOMAIN_TIME) { \
      *(int *) (hdr).prof_rate = rate_; \
      strncpy((hdr).dimen, "seconds", sizeof (hdr).dimen); \
      (hdr).dimen_abbrev = 's'; \
   } else { \
      *(int *) (hdr).prof_rate = -rate_; \
      snprintf((hdr).dimen, sizeof (hdr).dimen, "events (%#x)", \
               (unsigned int) get_prof_info(PROF_GET_PERFCTR_EVENT)); \
      (hdr).dimen_abbrev = 'e'; \
   } \
} while (0)

/*
 * Write a gmon time-histogram record (tag, header, buckets) to fd.
 *
 * fd           output descriptor
 * lowpc/highpc address range stored in the record header
 * buf          sample buckets, or NULL to read /proc/profile/PC_samples
 * hist_size    number of buckets to write
 * start_index  index of the first bucket to write
 *
 * If HISTCOUNTER is narrower than PC_sample_count_t the selected buckets are
 * narrowed in place, so a caller-supplied buf is modified.  A mismatch
 * between hist_size and the size of the file read is only reported, not
 * corrected.  A failing writev() is fatal.
 */
void write_gprof_pc_hist(
	int fd,
	address_t lowpc,
	address_t highpc,
	PC_sample_count_t *buf,
	int hist_size,
	int start_index)
{
   u_char tag = GMON_TAG_TIME_HIST;
   struct gmon_hist_hdr thdr PTR_ALIGNED;
   char *cbuf;
   size_t len;
   int end = start_index + hist_size;
   int need_free = 0;
   
   if (buf == NULL) {
      buf = (PC_sample_count_t *) read_file("/proc/profile/PC_samples", &len);
      if ((size_t) hist_size != len / sizeof(PC_sample_count_t)) {
         fprintf(stderr,"write_gprof_pc_hist: hist_size:%d len/sizeof(PC_sample_count_t:%lu\n",
                 hist_size, (unsigned long) (len / sizeof(PC_sample_count_t)));
      }
      need_free = 1;
   }
   cbuf = (char *)buf;
   
   /* gprof history buckets are of type HISTCOUNTER so we may need to convert
    * our samples
    */
   if (sizeof(HISTCOUNTER) < sizeof(PC_sample_count_t)) {
      int i;

      /*
       * Narrow in place, lowest index first: slot i of the narrow array
       * never extends past slot i of the wide one, so no sample is
       * overwritten before it has been read.  memcpy avoids accessing the
       * buffer through two incompatible pointer types.
       */
      for (i = start_index; i < end; ++i) {
         HISTCOUNTER narrow = (HISTCOUNTER) buf[i];

         memcpy(cbuf + (size_t) i * sizeof narrow, &narrow, sizeof narrow);
      }
   }
   
   {
      struct iovec iov[3] = {
         { &tag, sizeof(tag) },
         { &thdr, sizeof(struct gmon_hist_hdr) },
         { cbuf + start_index * sizeof(HISTCOUNTER), hist_size * sizeof(HISTCOUNTER) }
      };
      
      *(char **) thdr.low_pc = (char *) lowpc;
      *(char **) thdr.high_pc = (char *) highpc;
      *(int *) thdr.hist_size = hist_size;
      SET_PROF_RATE(thdr);
      
      if (writev(fd, iov, 3) < 0)
         err_exit("writev");
   }
   if (need_free)
      free(buf);
}

/*
 * Round x up to the next multiple of n (x itself if already a multiple).
 * n must be a power of 2.
 */
static inline size_t roundup(size_t x, int n)
{
   const size_t low_bits = (size_t) (n - 1);

   return (x + low_bits) & ~low_bits;
}

/* Number of arc records gathered before they are written with one writev */
#define NARCS_PER_WRITEV 32

/*
 * Write the call graph to fd as a sequence of gmon arc records.
 *
 * fd            output descriptor
 * lowpc/highpc  address range covered by the "froms" table
 * froms         per-bucket index of the first arc of each chain, or NULL to
 *               read the tables from /proc/profile/call_graph
 * from_len      number of entries in froms (recomputed when froms is NULL)
 * tos           arc entries, chained through their 'link' field
 *
 * When the tables are read from /proc, the file holds one froms/tos pair per
 * CPU (located through memory_map).  The arcs of every CPU other than 0 are
 * first merged into CPU 0's tables: counts of arcs already present are
 * added, new arcs are appended until CG_MAX_ARCS is reached, and arcs beyond
 * that are counted as lost.  The merge is skipped for caller-supplied
 * tables, which have no per-CPU copies to merge.
 *
 * Summary statistics are printed to stdout.  A failing writev() is fatal.
 */
void write_gprof_call_graph(
	int fd,
	address_t lowpc,
	address_t highpc,
	unsigned short *froms,
	int from_len,
	struct cg_arc_dest *tos)
{
   u_char tag = GMON_TAG_CG_ARC;
   struct gmon_cg_arc_record raw_arc[NARCS_PER_WRITEV] PTR_ALIGNED;
   struct iovec iov[2 * NARCS_PER_WRITEV];
   char *buf = NULL;
   int from_index, to_index, nfilled, step;
   address_t frompc;
   long total_count = 0L;
   long cg_input = 0L;
   long cg_merged = 0L;
   long cg_base = 0L;
   long cg_records = 0L;
   long lost_ones = 0L;
   
   /* Every record is a tag byte followed by one raw arc. */
   for (nfilled = 0; nfilled < NARCS_PER_WRITEV; ++nfilled) {
      iov[2 * nfilled].iov_base = &tag;
      iov[2 * nfilled].iov_len = sizeof tag;
      iov[2 * nfilled + 1].iov_base = &raw_arc[nfilled];
      iov[2 * nfilled + 1].iov_len = sizeof(struct gmon_cg_arc_record);
   }
   
   nfilled = 0;
   step = get_prof_info(PROF_GET_PC_RES);
   if (froms == NULL) {
      buf = read_file("/proc/profile/call_graph", NULL);
      froms = (unsigned short *) buf;
      from_len = (highpc - lowpc) / step;
      tos = (struct cg_arc_dest *) (buf + memory_map.cg_to_offset);
   }

   /*
    * Merge the per-CPU tables into CPU 0's.  Only possible when the tables
    * were read from /proc: the per-CPU pointers are offsets into buf.
    */
   if (buf != NULL) {
      /* tos[0].count holds the index of the last used entry of CPU 0 */
      int curix = tos[0].count + 1;
      int cpu;

      for (cpu = 0; cpu < memory_map.nr_cpus; cpu++)
      {
         unsigned short *fromc =
            (unsigned short *) (buf + memory_map.cg_from_size * cpu);
         struct cg_arc_dest *toc =
            (struct cg_arc_dest *) (buf + memory_map.cg_to_offset +
                                    memory_map.cg_to_size * cpu);

         for (from_index = 0; from_index < from_len; ++from_index)
         {
            for (to_index = fromc[from_index]; to_index != 0;
                 to_index = toc[to_index].link)
            {
               int i;

               cg_input++;
               if (!cpu)
               {
                  /* CPU 0 is the merge target; just count its arcs */
                  cg_base++;
                  continue;
               }
               for (i = froms[from_index]; i != 0; i = tos[i].link)
               {
                  if (tos[i].address == toc[to_index].address)
                  {
                     cg_merged++;
                     tos[i].count += toc[to_index].count;
                     break;
                  }
               }
               if (i == 0)
               {
                  /* not present yet: prepend a new arc to the chain */
                  if (curix >= CG_MAX_ARCS)
                     lost_ones++;
                  else
                  {
                     tos[curix].link = froms[from_index];
                     tos[curix].address = toc[to_index].address;
                     tos[curix].count = toc[to_index].count;
                     froms[from_index] = curix++;
                  }
               }
            }
         }
      }
   }

   /* Emit every arc, NARCS_PER_WRITEV records at a time. */
   for (from_index = 0; from_index < from_len; ++from_index)
   {
      frompc = lowpc + from_index * step;

      for (to_index = froms[from_index]; to_index != 0;
           to_index = tos[to_index].link)
      {
         *(address_t *) raw_arc[nfilled].from_pc = frompc;
         *(address_t *) raw_arc[nfilled].self_pc =
            (address_t) tos[to_index].address;
         *(int *) raw_arc[nfilled].count = tos[to_index].count;
         total_count += tos[to_index].count;
         cg_records++;
         if (++nfilled == NARCS_PER_WRITEV)
         {
            if (writev(fd, iov, 2 * NARCS_PER_WRITEV) < 0)
               err_exit("writev");
            nfilled = 0;
         }
      }
   }
   
   if (nfilled > 0 && writev(fd, iov, 2 * nfilled) < 0)
      err_exit("writev");
   free(buf);   /* NULL when the caller supplied the tables */

   printf("Total call trace count:    %7ld\n", total_count);
   if (memory_map.nr_cpus)
   {
      printf("Input call graph records:  %7ld\n", cg_input);
      printf("CPU-0 call graph records:  %7ld\n", cg_base);
      printf("Merged call graph records: %7ld\n", cg_merged);
   }
   printf("Output call graph records: %7ld\n", cg_records);
   if (lost_ones)
      printf("Lost call graph records:   %7ld\n", lost_ones);
}

/*
 * write gmon.out header
 *
 * The header is zero-filled, then given the gmon magic cookie and version
 * before being written to fd.  A failed or short write is fatal.
 */
void write_gprof_hdr(int fd)
{
    struct gmon_hdr ghdr PTR_ALIGNED;

    memset(&ghdr, 0, sizeof ghdr);
    memcpy(&ghdr.cookie[0], GMON_MAGIC, sizeof(ghdr.cookie));
    *(int *) ghdr.version = GMON_VERSION;
    if (write(fd, &ghdr, sizeof ghdr) != (ssize_t) sizeof ghdr)
       err_exit("write");
}

/*
 * generate a call graph data file for gprof
 *
 * outFile  output path, or NULL for the default (defaultGprofOutFile)
 *
 * Profiling is stopped first, then the gmon header, the PC histogram and
 * the call graph are written, all taken from the kernel (/proc/profile).
 * The address range and bucket count come from memory_map, which the
 * caller must have filled in.
 */
void output_cg_profile(const char *outFile)
{
   const char *path = outFile ? outFile : defaultGprofOutFile;
   int fd;
   address_t lowpc, highpc;
   
   lowpc = memory_map.kernel_start;
   highpc = memory_map.kernel_end;

   /* creat() reports failure with -1; 0 is a valid descriptor */
   if ((fd = creat(path, 0666)) == -1)
      err_exit(path);
   prof_ioctl(PROF_STOP, 0); /* to avoid any races and freeze the data */
   write_gprof_hdr(fd);
   write_gprof_pc_hist(fd, lowpc, highpc, NULL, memory_map.kernel_buckets, 0);
   write_gprof_call_graph(fd, lowpc, highpc, NULL, 0, NULL);
   close(fd);
}

/*
 * convert arg into a CPU bitmap
 *
 * "all" yields a bitmap with every bit set.  Otherwise arg must be a
 * decimal CPU number smaller than the number of bits in cpu_map_t, and the
 * result has only that bit set.  Anything else (including an empty string)
 * is reported and the program exits with status 1.
 */
cpu_map_t get_cpu(const char *arg)
{
   unsigned long cpu;
   char *endp;

   if (!strcmp(arg, "all"))
       return ~(cpu_map_t) 0;

   /*
    * Keep the full unsigned long so that large values are rejected rather
    * than truncated; endp == arg means no digits were converted at all.
    */
   cpu = strtoul(arg, &endp, 10);
   if (endp == arg || *endp || cpu >= 8 * sizeof(cpu_map_t)) {
      fprintf(stderr, "%s: illegal CPU id %s\n", prgname, arg);
      exit(1);
   }
   return (cpu_map_t)1 << cpu;
}

/* SIGINT handler: only records that ^C was received (sets recv_ctlC). */
void sigint_handler(int ignore)
{
   (void) ignore;   /* signal number is not needed */
   recv_ctlC = 1;
}

/*
 * Write the call-stack-sampling header record (tag byte followed by the
 * header) to fd.  Only the rate and dimension fields of the header are
 * filled in, by SET_PROF_RATE.
 */
void write_gprof_call_stack_sampling_header(int fd)
{
   u_char tag = GMON_TAG_CALL_STACK_SAMPLING_HDR;
   struct gmon_call_stack_sampling_hdr_record hdr PTR_ALIGNED;
   struct iovec iov[2];

   iov[0].iov_base = &tag;
   iov[0].iov_len = sizeof tag;
   iov[1].iov_base = &hdr;
   iov[1].iov_len = sizeof hdr;

   SET_PROF_RATE(hdr);
   writev (fd, iov, 2);
}

/*
 * When writing call traces we compress the output file by aggregating
 * all the module and user samples.  This is easy to do because all these
 * traces have a single entry that is either UNKNOWN_KERNEL or USER.  The
 * variables below are set (in collect_call_traces) to the addresses of
 * these symbols: module_addr for UNKNOWN_KERNEL and user_addr for USER.
 * stalled_addr is handled the same way for the optional STALLED symbol and
 * is 0 when that symbol is not found.
 */
address_t module_addr, user_addr, stalled_addr;

/*
 * Convert a raw call-trace file into a gmon file with a PC histogram and
 * call-graph arcs.
 *
 * from  input file of trace records.  Each record is one kaddress_t word
 *       holding a count in its upper half and a length in its lower half
 *       (count 0 means 1), followed by that many kernel addresses.
 * to    gmon output file to create
 *
 * The first address of each trace is added to the histogram; every
 * following address gets an arc to the address that precedes it in the
 * record.  The text range is taken from "_stext"/"_etext" in /proc/ksyms.
 * All errors are fatal.
 */
void convert_to_gprof_format(const char *from, const char *to)
{
   /* arc indices are stored in unsigned short fields */
   const int max_arcs = (unsigned short) ~0U;
   address_t lowpc, highpc;
   int i, in, out, res, next_to, high_to, step, idx, max_idx;
   address_t upcs[PROF_BACKTRACE_MAX_LEN];
   kaddress_t len, count, kpcs[PROF_BACKTRACE_MAX_LEN];
   size_t l;
   FILE *map;
   PC_sample_count_t *pcs;
   unsigned short *froms;
   struct cg_arc_dest *tos;

   if ((map = fopen("/proc/ksyms", "r")) == NULL)
      err_exit("/proc/ksyms");
   if ((lowpc = get_symbol_address(map, "_stext")) == 0) {
      fprintf(stderr, "%s: can't find \"_stext\" in /proc/ksyms\n", prgname);
      exit(1);
   }
   if ((highpc = get_symbol_address(map, "_etext")) == 0) {
      fprintf(stderr, "%s: can't find \"_etext\" in /proc/ksyms\n", prgname);
      exit(1);
   }
   fclose(map);

   step = get_prof_info(PROF_GET_PC_RES);
   max_idx = (highpc - lowpc) / step;

   /*
    * The tables are accumulated into and searched before being written, so
    * they must start out zeroed.  One spare slot is allocated because a pc
    * just below highpc maps to index max_idx when the text size is not a
    * multiple of step.
    */
   pcs = calloc((size_t) max_idx + 1, sizeof(*pcs));
   froms = calloc((size_t) max_idx + 1, sizeof(*froms));
   tos = calloc(1, 64 * 1024);

   if (pcs == NULL || froms == NULL || tos == NULL)
     err_exit("malloc");
   
   next_to = 1;                         /* index 0 terminates a chain */
   high_to = (64*1024) / sizeof(*tos);

   if ((in = open(from, O_RDONLY)) == -1)
      err_exit(from);
   if ((out = creat(to, 0666)) == -1)
      err_exit(to);
   write_gprof_hdr(out);

   while ((res = read(in, &len, sizeof len)) == sizeof len) {
      /* "len" holds the count in the upper half, len in the lower */
      count = len >> ((sizeof len) * 4);
      if (count == 0)
         count = 1;             /* backwards compat -- "0" means 1 */
      len &= (1LL << ((sizeof len) * 4)) - 1;
      
      if (len > PROF_BACKTRACE_MAX_LEN) {
         fprintf(stderr,
                 "%s: trace entry too long, suspect data corruption\n", from);
         exit(1);
      }
      if (len == 0)
         continue;              /* empty trace: nothing to account */

      l = (size_t) len * sizeof(kaddress_t);

      /*
       * If kernel- and user-space addresses are the same we read straight into
       * upcs, otherwise we read the kernel addresses into a temporary buffer
       * and convert them later into addresses user applications can handle.
       */
      res = read(in, sizeof(address_t) == sizeof(kaddress_t)
                        ? (char *)&upcs : (char *)&kpcs, l);

      if (res < 0 || (size_t) res != l) {
         if (res != 0) break;   /* reported after the loop */
         fprintf(stderr, "%s: premature end-of-file\n", from);
         exit(1);
      }

      if (sizeof(address_t) != sizeof(kaddress_t)) {
         for (i = 0; i < (int) len; i++)
            upcs[i] = kpcs[i];
      }

      if (upcs[0] < lowpc || upcs[0] >= highpc) {
         fprintf(stderr, "%s: bad pc 0x%lx\n", from, upcs[0]);
         exit(1);
      }
      idx = (upcs[0] - lowpc) / step;
      pcs[idx] += count;
      if (pcs[idx] < count)
         fprintf(stderr, "%s: pc 0x%lx count wrapped\n", from, upcs[0]);

      for (i = 1; i < (int) len; i++) {
         int to_idx;
         
         if (upcs[i] < lowpc || upcs[i] >= highpc) {
            fprintf(stderr, "%s: bad pc 0x%lx\n", from, upcs[i]);
            exit(1);
         }

         if (next_to == high_to) {
            /* grow the arc table; keep the old pointer until success */
            struct cg_arc_dest *grown =
               realloc(tos, 2 * (size_t) high_to * sizeof(*tos));

            if (grown == NULL)
               err_exit("realloc");
            tos = grown;
            high_to *= 2;
         }
         
         idx = (upcs[i] - lowpc) / step;
         to_idx = froms[idx];
         while (to_idx && tos[to_idx].address != upcs[i-1])
            to_idx = tos[to_idx].link;
         if (to_idx)
            tos[to_idx].count += count;
         else {
            if (next_to > max_arcs) {
               /* the index would no longer fit in froms[] / link */
               fprintf(stderr, "%s: too many call graph arcs\n", from);
               exit(1);
            }
            to_idx = next_to++;
            tos[to_idx].link = froms[idx];
            tos[to_idx].address = upcs[i-1];
            tos[to_idx].count = count;
            froms[idx] = to_idx;
         }
      }
   }
   if (res < 0)
      err_exit(from);
   else if (res != 0) {
      fprintf(stderr, "%s: unexpected read size\n", from);
      exit(1);
   }

   /* pcs has one bucket per step of [lowpc, highpc), i.e. max_idx buckets */
   write_gprof_pc_hist(out, lowpc, highpc, pcs, max_idx, 0);
   write_gprof_call_graph(out, lowpc, highpc, froms, max_idx, tos);

   free(pcs);
   free(froms);
   free(tos);
   close(in);
   close(out);
}

/*
 * Convert a raw call-trace file into a gmon file of call-stack records.
 *
 * from  input file of trace records (a count/length word followed by that
 *       many kernel addresses)
 * to    gmon output file to create
 *
 * The output starts with the gmon header and the call-stack-sampling
 * header.  Single-entry traces whose address equals module_addr, user_addr
 * or (when non-zero) stalled_addr are only counted; every other trace is
 * written as its own record.  The three aggregate counts are written at the
 * end as one-entry records.  All errors are fatal.
 */
void convert_to_gprof_raw_format(const char *from, const char *to)
{
   int in, out, res, module_samples = 0, user_samples = 0, stalled_samples = 0;
   u_char tag = GMON_TAG_CALL_STACK;
   struct gmon_call_stack_record csrec PTR_ALIGNED;
   address_t pcs[PROF_BACKTRACE_MAX_LEN];
   kaddress_t len, count, kpcs[PROF_BACKTRACE_MAX_LEN];
   size_t l;

   /* tag, record header, then the addresses; iov[2].iov_len is set per record */
   struct iovec iov[3] = {
      { &tag, sizeof tag },
      { &csrec, sizeof csrec },
      { pcs, 0 }
   };

   if ((in = open(from, O_RDONLY)) == -1)
      err_exit(from);
   if ((out = creat(to, 0666)) == -1)
      err_exit(to);
   write_gprof_hdr(out);
   write_gprof_call_stack_sampling_header(out);

   while ((res = read(in, &len, sizeof len)) == sizeof len) {
      /* "len" holds the count in the upper half, len in the lower */
      count = len >> ((sizeof len) * 4);
      if (count == 0)
	 count = 1;		/* backwards compat -- "0" means 1 */
      len &= (1LL << ((sizeof len) * 4)) - 1;
      
      if (len > PROF_BACKTRACE_MAX_LEN) {
	 fprintf(stderr,
		 "%s: trace entry too long, suspect data corruption\n", from);
	 exit(1);
      }

      l = (size_t) len * sizeof(kaddress_t);

      /*
       * If kernel- and user-space addresses are the same we read straight into
       * pcs, otherwise we read the kernel addresses into a temporary buffer
       * and convert them later into addresses user applications can handle.
       */
      res = read(in, sizeof(address_t) == sizeof(kaddress_t)
			? (char *)&pcs : (char *)&kpcs, l);

      /*
       * A short or failed read leaves the loop and is reported below; a
       * read of 0 bytes where data was expected is a truncated file.
       */
      if (res != l) {
	 if (res != 0) break;
	 fprintf(stderr, "%s: premature end-of-file\n", from);
	 exit(1);
      }

      if (sizeof(address_t) != sizeof(kaddress_t)) {
	 int i;

	 for (i = 0; i < (int) len; i++)
		 pcs[i] = kpcs[i];
      }

      /* Aggregate the single-entry module/user/stalled traces */
      if (len == 1 && pcs[0] == module_addr)
	 module_samples += count;
      else if (len == 1 && pcs[0] == user_addr)
	 user_samples += count;
      else if (len == 1 && stalled_addr && pcs[0] == stalled_addr)
	 stalled_samples += count;
      else {
	 *(int *) csrec.count = count;
	 *(int *) csrec.stack_size = (int) len;
	 iov[2].iov_len = (size_t) len * sizeof(address_t);
	 if (writev(out, iov, 3) < 0)
	    err_exit(to);
      }
   }
   /* res is the result of the read that ended the loop; 0 is a clean EOF */
   if (res < 0)
      err_exit(from);
   else if (res != 0) {
      fprintf(stderr, "%s: unexpected read size\n", from);
      exit(1);
   }

   /* Write out the aggregate user, module and stalled samples */
   iov[2].iov_len = sizeof(address_t);
   *(int *) csrec.stack_size = 1;
   if (user_samples) {
      *(int *) csrec.count = user_samples;
      pcs[0] = user_addr;
      if (writev(out, iov, 3) < 0)
         err_exit(to);
   }
   if (module_samples) {
      *(int *) csrec.count = module_samples;
      pcs[0] = module_addr;
      if (writev(out, iov, 3) < 0)
         err_exit(to);
   }
   if (stalled_samples) {
      *(int *) csrec.count = stalled_samples;
      pcs[0] = stalled_addr;
      if (writev(out, iov, 3) < 0)
         err_exit(to);
   }

   close(in);
   close(out);
}

void install_sigint_handler()
{
   struct sigaction sigact;
	
   sigact.sa_handler = &sigint_handler;
   sigemptyset(&sigact.sa_mask);
   sigact.sa_flags = 0;
   sigaction(SIGINT, &sigact, NULL);
}

/*
 * Collect call backtraces until ^C, then convert them to gmon files.
 *
 * map      bitmap of the CPUs to collect from
 * outFile  base name of the output files, or NULL for the default; one
 *          file "<outFile>.cpu<N>" is produced per selected CPU
 *
 * For every selected CPU, /dev/profile<N> is read every sleep_msec
 * milliseconds into a temporary file "call_trace.<N>" in the current
 * directory.  On SIGINT profiling is stopped, each temporary file is
 * converted and then removed.  While copying, the process runs with
 * SCHED_FIFO priority 1 (best effort; failure to obtain it is ignored).
 */
void collect_call_traces(unsigned long map, const char *outFile)
{
   /* bit numbers that can be tested in 'map' without an invalid shift */
   const int map_bits = (int) (8 * sizeof map);
   int n = 0, i, buf_reads = 0, ovfl = 0;
   int proc_fds[NR_CPUS], out_fds[NR_CPUS];
   char s[256];
   struct sched_param sched_prio;
   ssize_t max_read = 0;
   FILE *mapf;
   
   if ((mapf = fopen("/proc/ksyms", "r")) == NULL)
      err_exit("/proc/ksyms");
   if ((user_addr = get_symbol_address(mapf, "USER")) == 0) {
      fprintf(stderr, "%s: can't find \"USER\" in /proc/ksyms\n", prgname);
      exit(1);
   }
   if ((module_addr = get_symbol_address(mapf, "UNKNOWN_KERNEL")) == 0) {
      fprintf(stderr, "%s: can't find \"UNKNOWN_KERNEL\" in /proc/ksyms\n",
              prgname);
      exit(1);
   }
   /* optional symbol: 0 simply means "not present", no biggie */
   stalled_addr = get_symbol_address(mapf, "STALLED");
   fclose(mapf);
   
   /* 1UL: the shift must be done in the type of 'map', not in int */
   for (i = 0; i < NR_CPUS && i < map_bits; ++i)
      if (map & (1UL << i)) {
         sprintf(s, "/dev/profile%d", i);
         if ((proc_fds[n] = open(s, O_RDONLY)) == -1)
            err_exit(s);
         sprintf(s, "call_trace.%d", i);
         if ((out_fds[n++] = creat(s, 0666)) == -1)
            err_exit(s);
      }
   install_sigint_handler();

   /* We become low priority RT so we can copy the buffers quickly */
   sched_getparam(0, &sched_prio);
   sched_prio.sched_priority = 1;
   sched_setscheduler(0, SCHED_FIFO, &sched_prio);
   
   while (!recv_ctlC) {
      kaddress_t buf[PROF_BACKTRACE_BUFSIZE - 1];
      ssize_t len;

      for (i = 0; i < n; ++i)
         if ((len = read(proc_fds[i], buf, sizeof buf)) > 0) {
            buf_reads++;
            /* a completely full read suggests the kernel buffer overflowed */
            if (len == sizeof buf) ovfl++;
            if (len > max_read) max_read = len;
            if (write(out_fds[i], buf, len) != len)
               err_exit("error writing call backtraces");
         }
      usleep(sleep_msec * 1000);
   }

   /* Go back to normal priority */
   sched_prio.sched_priority = 0;
   sched_setscheduler(0, SCHED_OTHER, &sched_prio);
   
   prof_ioctl(PROF_STOP, 0);
   for (i = 0; i < n; ++i) {
      close(proc_fds[i]);
      close(out_fds[i]);
   }
   if (!outFile)
      outFile = defaultGprofOutFile;
   for (i = 0; i < NR_CPUS && i < map_bits; ++i)
      if (map & (1UL << i)) {
         char s1[256];

         sprintf(s, "call_trace.%d", i);
         /* outFile comes from the command line: never overrun s1 */
         if (snprintf(s1, sizeof s1, "%s.cpu%d", outFile, i)
             >= (int) sizeof s1) {
            fprintf(stderr, "%s: output file name too long\n", prgname);
            exit(1);
         }
         convert_to_gprof_format(s, s1);
#if NOTYET
         if (snprintf(s1, sizeof s1, "%s.raw.cpu%d", outFile, i)
             >= (int) sizeof s1) {
            fprintf(stderr, "%s: output file name too long\n", prgname);
            exit(1);
         }
         convert_to_gprof_raw_format(s, s1);
#endif
         remove(s);
      }
   if (ovfl > 0)
      printf("%i possible buffer overflows out of %i buffer reads (max = %i)\n",
             ovfl, buf_reads, (int) max_read);
}

/*
 * Parse the argument of option -<opt> as a non-zero unsigned integer
 * (decimal, octal with leading 0, or hex with leading 0x).
 *
 * Zero, trailing garbage, an empty string, or a value that does not fit in
 * an unsigned int are rejected with a diagnostic and exit status 1.
 */
unsigned int get_uint_arg(const char *s, const char opt)
{
   unsigned long val;
   char *endp;

   errno = 0;
   val = strtoul(s, &endp, 0);
   /*
    * Compare against the narrowed value so that numbers too large for the
    * return type are rejected instead of being silently truncated.
    */
   if (val == 0 || *endp || errno == ERANGE || val != (unsigned int) val) {
      fprintf(stderr, "%s: -%c takes an unsigned integer\n", prgname, opt);
      exit(1);
   }
   return (unsigned int) val;
}
	
/*
 * Parse the command line, then apply the requested actions in a fixed
 * order, independent of the order of the options: stop, reset, sleep
 * interval, PMC event, pid, mode, domain, frequency, CPU map, start,
 * status, call-graph output, PC-sample output.  If profiling was started
 * and the mode is call backtrace, traces are then collected until ^C.
 *
 * Every failing step terminates the program with exit status 1.
 */
int main (int argc, char **argv)
{
   cpu_map_t cpu_map = 0;
   char *freq = NULL, *mode = NULL, *sleep_arg = NULL, *domain = NULL;
   char *pid = NULL, *event = NULL, *outFile = NULL;
   const char *mapFile = DEFAULT_MAP;
   int c, status = 0, reset = 0, start = 0, stop = 0, verbose = 0;
   int call_graph = 0, pc_sample = 0;

   prgname = argv[0];
   while ((c = getopt(argc, argv, "a:d:m:c:f:o:p:t:w:beighrsv")) != -1) {
      switch (c) {
         case 'a': event = optarg;             break;
         case 'b': start++;                    break;
         case 'c': cpu_map |= get_cpu(optarg); break;
         case 'd': domain = optarg;            break;
         case 'e': stop++;                     break;
         case 'f': freq = optarg;              break;
         case 'g': call_graph++;               break;
         case 'i': pc_sample++;                break;
         case 'm': mapFile = optarg;           break;
         case 'o': outFile = optarg;           break;
         case 'p': pid = optarg;               break;
         case 'r': reset++;                    break;
         case 's': status++;                   break;
         case 't': mode = optarg;              break;
         case 'v': verbose++;                  break;
         case 'w': sleep_arg = optarg;         break;
         case 'h':
         default:  usage();                    /* does not return */
                   break;
      }
   }

   if ((ctl_fd = open("/dev/profile", O_RDONLY)) == -1)
      err_exit("/dev/profile");

   if (stop)
   {
      prof_ioctl(PROF_STOP, 0);
      prof_ioctl(PROF_GET_MAPPING, (unsigned long) &memory_map);
   }

   if (reset)
   {
      prof_ioctl(PROF_RESET, 0);
      prof_ioctl(PROF_GET_MAPPING, (unsigned long) &memory_map);
   }

   if (sleep_arg)
      sleep_msec = get_uint_arg(sleep_arg, 'w');

   if (event)
      set_PMC_event(get_uint_arg(event, 'a'));

   if (pid)
      set_prof_pid(strtoul(pid, NULL, 0));
   
   if (mode) {
      if (!strcmp(mode, "pc"))
         set_prof_mode(PROF_MODE_PC_SAMPLING);
      else if (!strcmp(mode, "acg"))
         set_prof_mode(PROF_MODE_PC_SAMPLING | PROF_MODE_CALL_GRAPH);
      else if (!strcmp(mode, "ct"))
         set_prof_mode(PROF_MODE_BACKTRACE);
      else if (!strcmp(mode, "cg"))
         set_prof_mode(PROF_MODE_CALL_GRAPH);
      else if (!strcmp(mode, "cc"))
         set_prof_mode(PROF_MODE_CALL_COUNT);
      else if (!strcmp(mode, "scg"))
         set_prof_mode(PROF_MODE_SCHEDULER_CALL_GRAPH);
      else {
         fprintf(stderr, "%s: unsupported profiling mode %s\n", prgname, mode);
         exit(1);
      }
   }

   if (domain) {
      if (!strcmp(domain, "time"))
         set_prof_domain(PROF_DOMAIN_TIME);
      else if (!strcmp(domain, "pmc"))
         set_prof_domain(PROF_DOMAIN_PERFCTR);
      else {
         fprintf(stderr, "%s: unsupported profiling domain %s\n", prgname,
                 domain);
         exit(1);
      }
   }

   if (freq)
      set_prof_freq(get_uint_arg(freq, 'f'));

   if (cpu_map)
      prof_ioctl(PROF_SET_ENABLE_MAP, cpu_map);

   if (start)
      prof_ioctl(PROF_START, 0);

   if (status)
      show_prof_status();

   if (call_graph)
   {
      prof_ioctl(PROF_GET_MAPPING, (unsigned long) &memory_map);
      output_cg_profile(outFile);
   }

   if (pc_sample)
   {
      prof_ioctl(PROF_GET_MAPPING, (unsigned long) &memory_map);
      output_pc_profile(mapFile, outFile, verbose);
   }

   /*
    * The enable map is a cpu_map_t, so it must be fetched with the bitmap
    * accessor: get_prof_info() only provides an int for the kernel to
    * store into.
    */
   if (start && get_prof_info(PROF_GET_MODE) == PROF_MODE_BACKTRACE)
      collect_call_traces(get_prof_info_bitmap(PROF_GET_ENABLE_MAP), outFile);

   close(ctl_fd);
   return 0;
}
