#!/usr/bin/perl -w
#
# Copyright (c) 2003 Parallelldatorcentrum
# (Center for Parallel Computers, KTH, Sweden)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without 
# modification, are permitted provided that the following conditions 
# are met: 
#
# 1. Redistributions of source code must retain the above copyright 
#    notice, this list of conditions and the following disclaimer. 
#
# 2. Redistributions in binary form must reproduce the above copyright 
#    notice, this list of conditions and the following disclaimer in the 
#    documentation and/or other materials provided with the distribution. 
#
# 3. Neither the name of the Institute nor the names of its contributors 
#    may be used to endorse or promote products derived from this software 
#    without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
# ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
# SUCH DAMAGE. 
#
# $Header: /afs/pdc.kth.se/home/p/pek/src/perl/RCS/i2prof.pl,v 1.44 2004/07/12 15:22:07 pek Exp $
#
# pek@pdc.kth.se $Date: 2004/07/12 15:22:07 $

# Calculate various statistics from performance counter values
# in the Itanium2 processor

use English;
use strict;
use Getopt::Std;
use File::Basename;

use vars qw($opt_a $opt_c $opt_d $opt_o $opt_s $opt_v $opt_V $opt_D $opt_l $opt_t $opt_p $opt_P $opt_q $opt_L);

# Name (or path) of the pfmon binary; replaced by the argument of -P.
my ($pfmon) = "pfmon";

# Text printed for -v.
my ($version) = '$Revision: 1.44 $ ' . "\n";
# One-line synopsis, printed on option errors and when there is no data.
my ($usage) = "Usage : [ -vachpDVLl ] [ -d datafile ] [ -t class ] [ -q metric ] [ -o pfmon-options ] [ -P pfmon ] [ program options ]\n";
# Per-option help text. The class list for -t has to name every class the
# -t handling accepts, including "memory".
my ($usage_long) = "Options :\n" .
    " -v         Print version.\n" .
    " -D	 Debug.\n" .
    " -V	 Verbose.\n" .
    " -a         Use all available questions.\n" .
    " -c	 List event comments.\n" .
    " -l         List counters needed for each statistic.\n" . 
    " -L         List questions.\n" . 
    " -t class   List statistics in the specified class. Currently\n" .
    "            supported classes are \"cache\", \"stall\", \"dstall\",\n" .
    "            \"efficiency\", \"latency\" and \"memory\"\n" .
    " -q metric  Get the data or counter list for \"metric\"\n" .
    " -d dfile   Calculate results from data file.\n" . 
    " -p         Run pfmon if needed. Put program to run and its options\n" .
    "            at the end of the line.\n" .
    " -P pfmon   Specify an alternate pfmon binary to use.\n" .
    " -o args    Extra arguments to pfmon, enclose in \"\"\n" .
    "";

# printf format used for every "statistic : value" result line.
my ($nformat) = "%-45s : %.3f\n";

# Due to the trivial parsing all event names must be surrounded by
# spaces, "(BE_EXE_BUBBLE_GRALL-BE_EXE_BUBBLE_GRGR)" will be parsed as
# a single token which is not what we want.

# Statistic name => formula. A formula is a Perl arithmetic expression in
# which every whitespace-separated word that is not a plain number is the
# name of a performance counter event.
my (%qs) = (
    'Loads/store' =>
        'LOADS_RETIRED / STORES_RETIRED',
    'D-cache stalls' =>
        '( BE_EXE_BUBBLE_GRALL - BE_EXE_BUBBLE_GRGR + BE_L1D_FPU_BUBBLE_L1D ) / BACK_END_BUBBLE_ALL',
    'Branch mispredict stalls' =>
        '( BE_FLUSH_BUBBLE_BRU + ( FE_BUBBLE_BUBBLE + FE_BUBBLE_BRANCH ) * ( BACK_END_BUBBLE_FE / FE_BUBBLE_ALLBUT_IBFULL ) ) / BACK_END_BUBBLE_ALL',
    'I-cache stalls' =>
        '( FE_BUBBLE_IMISS ) * ( BACK_END_BUBBLE_FE / FE_BUBBLE_ALLBUT_IBFULL ) / BACK_END_BUBBLE_ALL',
    'RSE stalls' =>
        'BE_RSE_BUBBLE_ALL / BACK_END_BUBBLE_ALL',
    'Front end (instruction decoding) stalls' =>
        'BACK_END_BUBBLE_FE / BACK_END_BUBBLE_ALL',
    'FPU stalls' =>
        '( BE_EXE_BUBBLE_FRALL + BE_L1D_FPU_BUBBLE_FPU ) / BACK_END_BUBBLE_ALL',
    'Support register dependency stalls' =>
        'BE_EXE_BUBBLE_ARCR_PR_CANCEL_BANK / BACK_END_BUBBLE_ALL',
    'Integer register dependency stalls' =>
        'BE_EXE_BUBBLE_GRGR / BACK_END_BUBBLE_ALL',
    'Integer load latency stalls' =>
        '( BE_EXE_BUBBLE_GRALL - BE_EXE_BUBBLE_GRGR ) / BACK_END_BUBBLE_ALL',
    'Undispersed instructions/cycle' =>
        '( SYLL_NOT_DISPERSED_ALL - SYLL_OVERCOUNT_ALL ) / CPU_CYCLES',
    'L2 hit rate' =>
        '1 - L2_MISSES / L2_REFERENCES',
    'L2D hit rate' =>
        '1 - ( L2_MISSES - L3_READS_INST_FETCH_ALL ) / L2_DATA_REFERENCES_L2_ALL',
    'L2 data references' =>
        'L2_DATA_REFERENCES_L2_ALL / L2_REFERENCES',
    'L3 hit rate' =>
        '1 - ( L3_MISSES - L3_WRITES_L2_WB_MISS ) / ( L3_REFERENCES - L3_WRITES_L2_WB_ALL )',
    'L3D hit rate' =>
        '( L3_READS_DATA_READ_HIT + L3_WRITES_DATA_WRITE_HIT ) / ( L3_READS_DATA_READ_ALL + L3_WRITES_DATA_WRITE_ALL )',
    'L3 data references' =>
        '( L3_READS_DATA_READ_ALL + L3_WRITES_ALL_ALL ) / L3_REFERENCES',
    'Instructions/cycle' =>
        'IA64_INST_RETIRED_THIS / CPU_CYCLES',
    'NOPs/instruction' =>
        'NOPS_RETIRED / IA64_INST_RETIRED_THIS',
    'NOPs/cycle' =>
        'NOPS_RETIRED / CPU_CYCLES',
    'Useful instructions/cycle' =>
        '( IA64_INST_RETIRED_THIS - NOPS_RETIRED ) / CPU_CYCLES',
    'Cycles/load' =>
        'CPU_CYCLES / LOADS_RETIRED',
    'Cycles/L2 data miss' =>
        'CPU_CYCLES / L2_MISSES',
    'Cycles/L3 data miss' =>
        'CPU_CYCLES / L3_MISSES',
    'Total stalls' =>
        'BACK_END_BUBBLE_ALL / CPU_CYCLES',
    'L1I hit rate' =>
        '( L1I_READS - ( L2_INST_DEMAND_READS + L2_INST_PREFETCHES )) / L1I_READS',
    'L1I prefetch hit rate' =>
        '( L1I_PREFETCHES - L2_INST_PREFETCHES ) / L1I_PREFETCHES',
    'L2I hit rate' =>
        '(( L2_INST_DEMAND_READS + L2_INST_PREFETCHES ) - L3_READS_INST_FETCH_ALL ) / ( L2_INST_DEMAND_READS + L2_INST_PREFETCHES )',
    'L1D hit rate' =>
        '1 - ( L1D_READ_MISSES_ALL / L1D_READS_SET0 )',
    'FLOP/cycle' =>
        'FP_OPS_RETIRED / CPU_CYCLES',
    'TLB misses per load/store' =>
        '( ITLB_MISSES_FETCH_L2ITLB + L2DTLB_MISSES ) / ( LOADS_RETIRED + STORES_RETIRED )',
    '4+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT4 / LOADS_RETIRED',
    '8+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT8 / LOADS_RETIRED',
    '16+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT16 / LOADS_RETIRED',
    '64+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT64 / LOADS_RETIRED',
    '256+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT256 / LOADS_RETIRED',
    '1024+ cycle latency loads' =>
        'DATA_EAR_CACHE_LAT1024 / LOADS_RETIRED',
    'Virtual memory stalls' =>
        '( BE_L1D_FPU_BUBBLE_L1D_TLB + BE_L1D_FPU_BUBBLE_L1D_HPW ) / BACK_END_BUBBLE_ALL',
    'L2 capacity stalls' =>
        'BE_L1D_FPU_BUBBLE_L1D_L2BPRESS / BACK_END_BUBBLE_ALL',
    'Failed speculative load penalty stalls' =>
        '( BE_L1D_FPU_BUBBLE_L1D_LDCHK + BE_L1D_FPU_BUBBLE_L1D_NAT + BE_L1D_FPU_BUBBLE_L1D_NATCONF ) / BACK_END_BUBBLE_ALL',
    'L2 cancels ratio' =>
        'L2_OZQ_CANCELS0_ANY / L2_REFERENCES',
    'L2 bank conflict ratio' =>
        'L2_OZQ_CANCELS1_BANK_CONF / L2_REFERENCES',
    'L2 recirculation stalls' =>
        '( BE_L1D_FPU_BUBBLE_L1D_DCURECIR + BE_L1D_FPU_BUBBLE_L1D_STBUFRECIR ) / BACK_END_BUBBLE_ALL',
    'Store related stalls' =>
        '( BE_L1D_FPU_BUBBLE_L1D_FULLSTBUF + BE_L1D_FPU_BUBBLE_L1D_FILLCONF ) / BACK_END_BUBBLE_ALL',
    'Main memory bandwidth used' =>
        '( BUS_MEMORY_EQ_128BYTE_SELF * 128 + BUS_MEMORY_LT_128BYTE_SELF * 64 ) / CPU_CYCLES',
    'Main memory loads/cycle' =>
        'L3_READS_ALL_MISS / CPU_CYCLES',
    'Loads' =>
        'LOADS_RETIRED / IA64_INST_RETIRED_THIS',
    'Stores' =>
        'STORES_RETIRED / IA64_INST_RETIRED_THIS',
    'Branches' =>
        'BR_MISPRED_DETAIL_ALL_ALL_PRED / IA64_INST_RETIRED_THIS',
    'L1D hits' =>
        '( L1D_READS_SET0 - L1D_READ_MISSES_ALL ) / ( LOADS_RETIRED + STORES_RETIRED )',
    'L2D hits' =>
        '( L2_DATA_REFERENCES_L2_ALL - ( L2_MISSES - L3_READS_INST_FETCH_ALL )) / ( LOADS_RETIRED + STORES_RETIRED )',
    'L3D hits' =>
        '( L3_READS_DATA_READ_HIT + L3_WRITES_DATA_WRITE_HIT ) /  ( LOADS_RETIRED + STORES_RETIRED )',
    'Main memory loads/stores' =>
        '( L3_READS_DATA_READ_MISS + L3_WRITES_DATA_WRITE_MISS ) / ( LOADS_RETIRED + STORES_RETIRED )',
);

# Statistic name => explanatory text printed by -c. Keys are the statistic
# names used in %qs; not every statistic has a comment. Each text ends in a
# newline and each key appears exactly once.
my (%comments) = (
    "L1D hits"
    => "Proportion of L1 hits to the total number of loads/stores.\n" .
    " Store hits are ignored since they're always passed through to L2.\n",
    "L2D hits"
    => "Proportion of L2 hits to the total number of loads/stores.\n",
    "L3D hits"
    => "Proportion of L3 hits to the total number of loads/stores.\n",
    "Main memory loads/stores"
    => "Proportion of loads/stores that go to main memory.\n",
    "L3 hit rate"
    => "L3 cache hit rate, excluding L2 write-backs\n",
    "L3D hit rate"
    => "L3D cache hit rate, excluding L2 write-backs\n",
    "Branch mispredict stalls"
    => " See \n" .
    "\"A Methodology for using Itanium 2 Performance Counters for Bottleneck Analysis\"\n" .
    " by Sverre Jarp, section 2.2.5.\n",
    "L2 cancels ratio"
    => "Ratio of cancelled L2 requests to total number of L2 references.\n" .
    " This ratio may not be \"accurate\" since a request could\n" .
    " potentially be cancelled several times. This event is mostly\n" .
    " intended as a reference point for the relative impact of\n" .
    " \"L2 bank conflict ratio\"\n",
    "L2 bank conflict ratio"
    => "Proportion of L2 references that are re-issued due to\n" .
    " bank conflicts in the L2 data array.\n",
    "I-cache stalls"
    => "Proportion of I-cache miss stall cycles to the total\n" .
    " number of stall cycles. See \n" .
    "\"A Methodology for using Itanium 2 Performance Counters for Bottleneck Analysis\"\n" .
    " by Sverre Jarp, section 2.2.5.\n",
    "Undispersed instructions/cycle"
    => "Number of instructions (syllables)/cycle not dispersed\n" .
    " for reasons other than stalls (stop bits, oversubscription,\n" .
    " invalid bundles/templates and MLI bundles).\n",

    "Main memory bandwidth used"
    => "Average memory bandwidth used in bytes/cycle.\n" .
    " This statistic assumes that all non-full cache line\n" .
    " transactions are 64 bytes, so it may overstate the actual\n" .
    " bandwidth used in some cases. I/O bandwidth is not counted.\n",
    "Store related stalls"
    => "Proportion of main pipeline stall cycles due to\n" .
    " a lack of resources for store operations.\n",
    "L2 recirculation stalls"
    => "Proportions of main pipeline stall cycles due to L2 \n" .
    " misses in cache lines already being fetched to total \n" .
    " number of stall cycles.\n",
    "L2 capacity stalls"
    => "Proportion of main pipeline stalls due to L2 \n" .
    " capacity limits to total number of stall cycles.\n" .
    " This equates to cache misses that stall the pipeline\n" .
    " since the L2 can no longer keep being non-blocking.\n",
    "Failed speculative load penalty stalls"
    => "Stalls caused by NAT values and load check conflicts.\n",
    "Virtual memory stalls"
    => "Proportion of VM stalls (L2TLB-L1TLB transfer or \n" .
    " HPW stalls) to total number of stall cycles.\n",
    "4+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 4 cycles to total loads.\n",
    "8+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 8 cycles to total loads.\n",
    "16+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 16 cycles to total loads.\n",
    "64+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 64 cycles to total loads.\n",
    "256+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 256 cycles to total loads.\n",
    "1024+ cycle latency loads"
    => "Ratio of data loads with a latency longer than \n" .
    " 1024 cycles to total loads.\n",
    "TLB misses per load/store"
    => "Counts \"real\" TLB misses, i.e. misses that cause\n" .
    " a VHPT walk or a TLB miss trap to the OS. Misses in the\n" .
    " L1 TLBs are not counted.\n",
    "L1I hit rate"
    => "Counts hits caused by both demand fetches and prefetch.\n",
    "L2I hit rate"
    => "Counts hits caused by both demand fetches and prefetch.\n",
    "FLOP/cycle"
    => "Number of floating point operations per cycle.\n" .
    " Counts logical operations, a FMA is counted as 2 ops\n" .
    " and a FPMA is counted as 4 ops.\n",
    "L2 data references"
    => "Ratio of data references to all references in L2.\n",
    "L3 data references"
    => "Ratio of data references to all references in L3.\n",
    "Instructions/cycle"
    => "Only counts IA-64 instructions actually executed.\n" .
    " IA-32 instructions and IA-64 instruction predicated\n" .
    " off are not considered.\n",
    "Total stalls"
    => "Proportion of stall cycles to total number of cycles.\n",
    "Support register dependency stalls"
    => "Stalls caused by bank switches, cancelled loads and\n" .
    " dependencies on application, control and predicate\n" .
    " registers.\n",
    "Integer register dependency stalls"
    => "Main execution pipeline stalls caused by register-register\n" .
    " dependencies (scoreboarding). (Ratio to all stalls)\n",
    "Integer load latency stalls"
    => "Main execution pipeline stalls caused by load instruction\n" .
    " latencies. (Ratio to all stalls)\n",
    "RSE stalls"
    => "Stalls due to the register stack engine (register spills\n" .
    " and fills).\n",
    "Useful instructions/cycle"
    => "Ratio of executed instructions less NOPs to total number\n" .
    " of cycles.\n",
    "FPU stalls"
    => "Ratio of stalls caused by load latencies and\n" .
    " register-register dependencies in the FPU to the\n" .
    " total number of stall cycles.\n"
);

# Statistics shown for "-t cache".
my (@cache_qs) = (
    'L1I hit rate',
    'L1I prefetch hit rate',
    'L2I hit rate',
    'L1D hit rate',
    'L2 hit rate',
    'L2D hit rate',
    'L2 data references',
    'L3 hit rate',
    'L3D hit rate',
    'L3 data references',
    'L2 bank conflict ratio',
);

# Statistics shown for "-t stall".
my (@stall_qs) = (
    'D-cache stalls',
    'Branch mispredict stalls',
    'I-cache stalls',
    'FPU stalls',
    'RSE stalls',
    'Integer register dependency stalls',
    'Support register dependency stalls',
);

# Statistics shown for "-t dstall".
my (@dstall_qs) = (
    'L2 capacity stalls',
    'Failed speculative load penalty stalls',
    'Integer load latency stalls',
    'L2 recirculation stalls',
    'Store related stalls',
    'Virtual memory stalls',
);

# Statistics shown for "-t efficiency".
my (@effic_qs) = (
    'Useful instructions/cycle',
    'FLOP/cycle',
    'NOPs/cycle',
    'Instructions/cycle',
    'Main memory bandwidth used',
    'NOPs/instruction',
    'Undispersed instructions/cycle',
    'Loads/store',
    'Total stalls',
);

# Statistics shown for "-t latency".
my (@lat_qs) = (
    '4+ cycle latency loads',
    '8+ cycle latency loads',
    '16+ cycle latency loads',
    '64+ cycle latency loads',
    '256+ cycle latency loads',
    '1024+ cycle latency loads',
);

# Statistics shown for "-t memory".
my (@mem_qs) = (
    'L1D hits',
    'L2D hits',
    'L3D hits',
    'Main memory loads/stores',
);

# Event name => "virtual" set number, for events where the set reported by
# pfmon is not used. build_event_set_list substitutes the number found here.
# Each D-EAR latency event has a number of its own; all L2_OZQ_CANCELS1_*
# events share 7 and all L2_OZQ_CANCELS2_* events share 8.
my (%confl) = (
    (map { ("DATA_EAR_CACHE_LAT" . $_->[0] => $_->[1]) }
         ([4, 1], [8, 2], [16, 3], [64, 4], [256, 5], [1024, 6])),
    (map { ("L2_OZQ_CANCELS1_" . $_ => 7) }
         qw(BANK_CONF CANC_L2M_ST CCV ECC HPW_IFETCH_CONF L1DF_L2M
            L1_FILL_CONF L2A_ST_MAT L2D_ST_MAT L2M_ST_MAT MFA REL SEM
            ST_FILL_CONF SYNC)),
    (map { ("L2_OZQ_CANCELS2_" . $_ => 8) }
         qw(ACQ CANC_L2C_ST CANC_L2D_ST DIDNT_RECIRC D_IFET L2C_ST_MAT
            L2FILL_ST_CONF OVER_SUB OZ_DATA_CONF READ_WB_CONF
            RECIRC_OVER_SUB SCRUB WEIRD)),
);


# Counter values keyed by event name; filled by load_counter_data.
my %events;
# Events waiting to be scheduled, indexed as $packlist[group][set].
my @packlist;
# Number of events that can be counted at once
my $num_slots = 4;
# Name of the counter data file; stays "" until -d or -p selects one.
my $dfile = "";
# Temps
my @l;
my $str;
# Names of the statistics selected on the command line.
my @qst;

# Switches for the verbose() and debug() output helpers.
my $verbose = 0;
my $debug = 0;

# Print the arguments, joined by spaces, when verbose output is enabled.
sub verbose(@) {
    print "@_" if $verbose;
}

# Print the arguments, joined by spaces and prefixed with "Debug : ", when
# debug output is enabled.
sub debug(@) {
    print "Debug : @_" if $debug;
}

# Run a command with system() and translate the wait status into a single
# exit code.
#
# Arguments: the command, exactly as it should be handed to system().
# Returns:   the command's exit code (0-255) when it exited normally,
#            128 + signal number when it was killed by a signal, and
#            255 when the command could not be started at all.
sub run (@) {
    debug(" @_\n");
    my ($rc) = system(@_);

    if ($rc == -1) {
	# system() failed before the command ran; shifting -1 would
	# produce a meaningless number.
	print STDERR "Could not execute \"@_\": $OS_ERROR\n";
	return 255;
    }
    if ($rc & 127) {
	# Death by signal leaves the exit-code byte at 0, which would
	# otherwise be reported as success.
	return 128 + ($rc & 127);
    }
    return $rc >> 8;
}

# Tell whether a command can be found through the PATH environment variable.
#
# Arguments: the command name.
# Returns:   1 if some PATH directory holds an executable regular file with
#            that name, 0 otherwise (also when PATH is not set).
sub here_p {
    my ($cmd) = shift;
    my ($path) = defined $ENV{PATH} ? $ENV{PATH} : "";

    foreach my $dir (split /:/, $path) {
	# An empty PATH element stands for the current directory.
	$dir = "." if $dir eq "";
	# -x alone is also true for searchable directories, so insist on a
	# regular file as well.
	if ( -f "$dir/$cmd" && -x "$dir/$cmd") {
	    return 1;
	}
    }
    return 0;
}

# Evaluate one formula against the loaded counter values.
#
# Arguments: the formula string; tokens are separated by whitespace. A token
#            made of word characters that is not a plain number is taken as
#            an event name and replaced by its value from %events; every
#            other token is passed through unchanged.
# Returns:   the numeric result (which may be 0), or nothing when a needed
#            event has no counter value.
# Dies:      when the expression cannot be evaluated (for instance a
#            division by zero).
sub handle_qstr($) {
    my ($formula) = @_;
    my ($cmd) = "";

    foreach my $token (split ' ', $formula) {
	if ($token =~ /^\w+$/ && $token !~ /^\d+$/) {
	    if (! defined $events{$token}) {
		verbose("Question lacks counter data for $token.\n");
		return;
	    }
	    $cmd .= " $events{$token} ";
	}
	else {
	    $cmd .= " $token ";
	}
    }
    debug("Cmd : $cmd\n");
    # NOTE(review): this is a string eval. The callers in this file only
    # pass formulas taken from %qs, and load_counter_data only stores
    # digit-only counter values; keep it that way.
    my ($res) = eval $cmd;
    # Test for undef rather than truth: a statistic may legitimately be 0.
    if (! defined $res) {
	debug("Eval error : $EVAL_ERROR\n");
	die "Derived event calculation failed!\n";
    }
    debug("Results : $res\n");
    return $res;
}

# Evaluate the statistic with the given name: look its formula up in %qs and
# hand it to handle_qstr, returning whatever that returns.
sub handle_q($) {
    my ($name) = @_;

    return handle_qstr($qs{$name});
}

# List the events a statistic needs.
#
# Arguments: the statistic name (a key of %qs).
# Returns:   the event names from its formula, in formula order and with
#            repeats kept. Plain numbers and non-word tokens are left out.
sub get_events($) {
    my ($name) = @_;
    my $formula = $qs{$name};
    my @wanted;

    debug("$formula\n");
    for my $word (split ' ', $formula) {
	debug("Token $word.\n");
	next unless $word =~ /^\w+$/;
	if ($word !~ /^\d+$/) {
	    push @wanted, $word;
	    next;
	}
	debug("Nontoken $word\n");
    }
    return @wanted;
}

# Print the events needed for a statistic on one line: a leading space, then
# each event name followed by a comma, then a newline.
sub needed_events($) {
    my ($name) = @_;
    my @needed = get_events($name);

    print " ", (map { "$_," } @needed), "\n";
}
       
# Load counter values from a data file into %events.
#
# Arguments: the name of the data file.
# Lines of the form "<whitespace><digits><whitespace><event name>..." set
# $events{<event name>} = <digits>; a later line for the same event replaces
# an earlier one. All other lines are ignored.
#
# A file that is missing or unreadable is skipped silently: with -p the data
# file is loaded once before pfmon has written anything to it.
sub load_counter_data($) {
    my ($fname) = shift;
    my ($fh);

    return unless -r $fname;
    # Read the file directly; passing the name through a shell command
    # breaks on names containing spaces or shell metacharacters.
    open($fh, '<', $fname) or return;
    while (my $line = <$fh>) {
	if ($line =~ /^\s+(\d+)\s+(\S+)\s*.*$/) {
	    debug("Event $2 value $1\n");
	    $events{$2} = $1;
	}
    }
    close $fh;
    return;
}

# Ask pfmon about every event and sort the events into @packlist.
#
# Arguments: a list of event names; duplicates are allowed and are
#            scheduled only once.
# Effect:    @packlist is emptied and refilled so that
#            $packlist[group][set] is a reference to the list of events in
#            that group and set. Groups: 0 "None", 1 "L1D Cache",
#            2 "L2 Cache", 3 data EAR in cache mode. The set is the one
#            printed by pfmon unless %confl overrides it.
# Returns:   the number of unique events placed in @packlist, or 0 (after a
#            message on STDERR) if pfmon could not be started.
sub build_event_set_list {
    my (@evs) = @ARG;
    my (%added);
    my ($ret) = 0;

    @packlist = ();
    foreach my $event (@evs) {
	# Each event is placed once, so do not query pfmon again for it.
	next if defined $added{$event};

	my ($set, $group) = (0, -1);
	my ($pfm);
	# The list form runs pfmon without a shell and lets open() report a
	# failure to start it.
	if (! open($pfm, '-|', $pfmon, '-i', "^$event\$")) {
	    print STDERR "pfmon invocation on event $event failed: $OS_ERROR\n";
	    return 0;
	}
	while (my $line = <$pfm>) {
	    if ($line =~ /^Set\s+\:\s+(\d)$/) {
		$set = $1;
		debug("Event $event is in set $1\n");
	    }
	    elsif ($group == -1) {
		# Only the first recognised group line counts.
		if ($line =~ /^Group  : None\s*$/) {
		    $group = 0;
		}
		elsif ($line =~ /^Group  : L1D Cache\s*$/) {
		    $group = 1;
		}
		elsif ($line =~ /^Group  : L2 Cache\s*$/) {
		    $group = 2;
		}
		# Virtual group for D-EAR events
		elsif ($line =~ /^EAR    : Data \(Cache Mode\)\s*$/) {
		    debug("Event $event is EAR group 3\n");
		    $group = 3;
		}
	    }
	}
	close $pfm;
	if ($group == -1) {
	    # No recognised group line in the pfmon output. Index -1 would
	    # address the last existing group, or be a fatal error while
	    # @packlist is still empty, so fall back to group 0.
	    verbose("No known group found for event $event, using group 0.\n");
	    $group = 0;
	}
	if (defined $confl{$event}) {
	    $set = $confl{$event};
	}
	debug("Adding event $event to set $set, group $group\n");
	push @{ $packlist[$group][$set] }, $event;
	$added{$event} = 1;
	$ret++;
    }
    return $ret;
}

# Pack events into a, hopefully small, number of pfmon invocations
#
# Arguments: a list of event names; they are handed to build_event_set_list,
#            which fills @packlist and returns the number of events in it.
# Returns:   a list of strings, each a comma-separated list of event names.
#            One string is produced per pass of the outer loop, and a string
#            holds at most $num_slots events.
sub pack_events {
    my ($cnt);      # events still waiting in @packlist
    my (@slots);    # events collected for the string being built
    my (@res);      # finished strings
    # $i counts the slots used in the current pass; $j is not used here.
    my ($i, $j, $minlen, $minindex, $set, $group);

    $cnt = build_event_set_list(@ARG);
    
    while ($cnt > 0) {
	$i = 0;
	# Pick an event from the shortest set in group 3, 2,1,0
	for ($group = 3; $group >= 0; $group--) {
	    # Start above any possible set length so that every non-empty
	    # set compares as shorter.
	    $minlen = $cnt + 1; $minindex = 0;
	    debug "Group $group has $#{ $packlist[$group] } sets\n";
	    if ($#{ $packlist[$group] } < 0) {
		next;
	    }
	    # Find the non-empty set with the fewest events in this group.
	    for ($set = 0; $set <= $#{ $packlist[$group] }; $set++) {
		debug("Group $group, set $set len " . ($#{ $packlist[$group][$set] } + 1) . ", minlen $minlen\n");
		if (defined $packlist[$group][$set] &&
		    $#{ $packlist[$group][$set] } >= 0 && 
		    $#{ $packlist[$group][$set] } + 1 < $minlen) {
		    $minlen = $#{ $packlist[$group][$set] } + 1;
		    $minindex = $set;
   		}
	    }
	    # Move events from that set into the current slot list until the
	    # set is empty or all $num_slots slots are taken. Only this one
	    # set of the group contributes to the current pass.
	    if ($#{ $packlist[$group][$minindex] } >= 0) {
		for (; $i < $num_slots && $minlen > 0; $i++, $minlen--) {
		    $str = pop @{ $packlist[$group][$minindex] };
		    debug "Popped $str from group $group, set $minindex\n";
		    push @slots, $str;
		    $cnt--;
		}
	    }
	    # Every event has been placed; stop looking at further groups.
	    if ($cnt == 0) {
		$i = $num_slots;
		last;
	    }
	}
	# Close the current pass: its events become one comma-separated
	# string in the result.
	$str = join ',', @slots;
	debug "Finished slot \"$str\", count is $cnt\n";
	push @res, "$str";
	@slots = ();
    }

    return @res;
}


#####################################################################
# Execution starts here
#####################################################################
# Parse the command line; if getopts rejects it, print both usage texts and
# stop.
unless (getopts('avcDVlLpd:o:P:q:s:t:')) {
    print($usage, $usage_long);
    exit;
}

# -v : print the version and stop.
if ($opt_v) {
    print($version);
    exit;
}
# -V and -D switch on the verbose() and debug() output.
$verbose = 1 if $opt_V;
$debug = 1 if $opt_D;
# -a : select every statistic defined in %qs.
@qst = keys %qs if $opt_a;
# -P : use another pfmon binary; the argument has to be an existing file.
if ($opt_P) {
    unless (-f $opt_P) {
	print "Invalid argument to -P option, must be a file.\n";
	exit 1;
    }
    $pfmon = $opt_P;
}
if ($opt_t) { # Only show a particular class of statistic
    # Class name => list of statistics in that class.
    my (%class_qs) = (
	"cache"      => \@cache_qs,
	"stall"      => \@stall_qs,
	"latency"    => \@lat_qs,
	"dstall"     => \@dstall_qs,
	"efficiency" => \@effic_qs,
	"memory"     => \@mem_qs,
	);

    if (! defined $class_qs{$opt_t}) {
	print STDERR "No such class \"$opt_t\"\n";
	# An unknown class is an error, so do not exit with status 0.
	exit 1;
    }
    # Replaces any selection made so far (for instance by -a).
    @qst = @{ $class_qs{$opt_t} };
}

# -q : add the comma-separated statistics named in the argument to the
# selection.
if ($opt_q) {
    push @qst, split ',', "$opt_q";
    # Check that all q's are valid
    foreach my $q (@qst) {
	if (! defined $qs{$q}) {
	    print "Invalid metric \"$q\"\n";
	    # An unknown metric is an error, so do not exit with status 0.
	    exit 1;
	}
    }
}
if ($opt_L) { # List available questions
    # With nothing selected list every statistic, otherwise the selection.
    my (@shown) = ($#qst < 0) ? keys %qs : @qst;

    print "\"$_\"\n" foreach @shown;
    exit;
}
if ($opt_l) { # List needed counters
    # One quoted name per selected statistic, followed by its event list.
    for my $name (@qst) {
	print "\"$name\"\n";
	needed_events($name);
    }

    exit;
}

if ($opt_c) { # List comments
    # Statistics without an entry in %comments are skipped.
    for my $name (grep { defined $comments{$_} } @qst) {
	print "$name :\n";
	print " $comments{$name}\n";
    }

    # Continue to the results only when a data file was given with -d.
    exit unless defined $opt_d;
}
# Decide where the counter data lives: the file named with -d, or a scratch
# file for this run when pfmon is to be invoked (-p).
if ($opt_d) { 
    $dfile = $opt_d;
}
elsif ($opt_p) {
    # Let File::Temp create the scratch file. A predictable name built from
    # the program name and the process id can be created or symlinked in
    # advance by another user of /tmp. UNLINK => 1 removes the file when the
    # script exits, whichever exit is taken.
    require File::Temp;
    my ($tmpfh);
    ($tmpfh, $dfile) = File::Temp::tempfile(basename($PROGRAM_NAME) . ".XXXXXX",
					    DIR => "/tmp", UNLINK => 1);
    close $tmpfh;
}
if ($dfile ne "") {
    my ($extra_args) = $opt_o ? $opt_o : "";
    my ($sum) = 0.0;

    load_counter_data($dfile);
    if ($opt_p) { # Run pfmon if needed
	if ($#ARGV < 0) {
	    print "You need to specify a program to run.\n";
	    # Missing program is an error, so do not exit with status 0.
	    exit 1;
	}
	# Events needed by the selected statistics that have no value yet.
	my (@missing) = grep { ! defined $events{$_} }
			map { get_events($_) } @qst;
	if (@missing) {
	    if ((! -x $pfmon) && ! here_p($pfmon)) {
		print "Could not find the pfmon binary in your path.\n";
		exit 1;
	    }
	    foreach my $evlist (pack_events(@missing)) {
		# Deliberately a single string: the -o arguments and the
		# program's own arguments are split by the shell.
		my ($rc) = run("$pfmon $extra_args --append --outfile=$dfile --events=$evlist @ARGV");
		verbose("pfmon run for $evlist returned status $rc.\n") if $rc;
	    }
	    load_counter_data($dfile);
	}
	else {
	    print "No need to run pfmon, all needed counts present in $dfile.\n";
	}
    }
    
    if ($opt_t && $opt_t eq "stall") {
	print "Stall numbers are the ratio to the total number of stall cycles.\n";
    }
    # Print every statistic that could be calculated and keep a running sum.
    foreach my $q (@qst) {
	my ($res) = handle_q($q);
	if (defined $res) {
	    printf $nformat, $q, $res;
	    $sum += $res;
	}
    }
    # The sum is only shown for the "stall" and "memory" classes.
    if ($opt_t && ($opt_t eq "stall" || $opt_t eq "memory")) {
	printf $nformat, "Sum", $sum;
    }
}
else {
    print "No data.\n";
    print $usage;
}
exit;