1 files changed, 737 insertions, 0 deletions
diff --git a/lib/DBI/ProfileData.pm b/lib/DBI/ProfileData.pm
new file mode 100644
index 0000000..b2db087
--- /dev/null
+++ b/lib/DBI/ProfileData.pm
@@ -0,0 +1,737 @@
+package DBI::ProfileData;
+use strict;
+
+=head1 NAME
+
+DBI::ProfileData - manipulate DBI::ProfileDumper data dumps
+
+=head1 SYNOPSIS
+
+The easiest way to use this module is through the dbiprof frontend
+(see L<dbiprof> for details):
+
+  dbiprof --number 15 --sort count
+
+This module can also be used to roll your own profile analysis:
+
+  # load data from dbi.prof
+  $prof = DBI::ProfileData->new(File => "dbi.prof");
+
+  # get a count of the records (unique paths) in the data set
+  $count = $prof->count();
+
+  # sort by longest overall time
+  $prof->sort(field => "longest");
+
+  # sort by longest overall time, least to greatest
+  $prof->sort(field => "longest", reverse => 1);
+
+  # exclude records with key2 eq 'disconnect'
+  $prof->exclude(key2 => 'disconnect');
+
+  # exclude records with key1 matching /^UPDATE/i
+  $prof->exclude(key1 => qr/^UPDATE/i);
+
+  # remove all records except those where key1 matches /^SELECT/i
+  $prof->match(key1 => qr/^SELECT/i);
+
+  # produce a formatted report with the given number of items
+  $report = $prof->report(number => 10); 
+
+  # clone the profile data set
+  $clone = $prof->clone();
+
+  # get access to hash of header values
+  $header = $prof->header();
+
+  # get access to sorted array of nodes
+  $nodes = $prof->nodes();
+
+  # format a single node in the same style as report()
+  $text = $prof->format($nodes->[0]);
+
+  # get access to Data hash in DBI::Profile format
+  $Data = $prof->Data();
+
+=head1 DESCRIPTION
+
+This module offers the ability to read, manipulate and format
+DBI::ProfileDumper profile data.  
+
+Conceptually, a profile consists of a series of records, or nodes,
+each of each has a set of statistics and set of keys.  Each record
+must have a unique set of keys, but there is no requirement that every
+record have the same number of keys.
+
+=head1 METHODS
+
+The following methods are supported by DBI::ProfileData objects.
+
+=cut
+
+
+our $VERSION = sprintf("2.%06d", q$Revision: 10007 $ =~ /(\d+)/o);
+
+use Carp qw(croak);
+use Symbol;
+use Fcntl qw(:flock);
+
+use DBI::Profile qw(dbi_profile_merge);
+
+# some constants for use with node data arrays
+sub COUNT     () { 0 };
+sub TOTAL     () { 1 };
+sub FIRST     () { 2 };
+sub SHORTEST  () { 3 };
+sub LONGEST   () { 4 };
+sub FIRST_AT  () { 5 };
+sub LAST_AT   () { 6 };
+sub PATH      () { 7 };
+
+
+my $HAS_FLOCK = (defined $ENV{DBI_PROFILE_FLOCK})
+    ? $ENV{DBI_PROFILE_FLOCK}
+    : do { local $@; eval { flock STDOUT, 0; 1 } };
+
+
+=head2 $prof = DBI::ProfileData->new(File => "dbi.prof")
+
+=head2 $prof = DBI::ProfileData->new(File => "dbi.prof", Filter => sub { ... })
+
+=head2 $prof = DBI::ProfileData->new(Files => [ "dbi.prof.1", "dbi.prof.2" ])
+
+Creates a a new DBI::ProfileData object.  Takes either a single file
+through the File option or a list of Files in an array ref.  If
+multiple files are specified then the header data from the first file
+is used.
+
+=head3 Files
+
+Reference to an array of file names to read.
+
+=head3 File
+
+Name of file to read. Takes precedence over C<Files>.
+
+=head3 DeleteFiles
+
+If true, the files are deleted after being read.
+
+Actually the files are renamed with a C.deleteme> suffix before being read,
+and then, after reading all the files, they're all deleted together.
+
+The files are locked while being read which, combined with the rename, makes it
+safe to 'consume' files that are still being generated by L<DBI::ProfileDumper>.
+
+=head3 Filter
+
+The C<Filter> parameter can be used to supply a code reference that can
+manipulate the profile data as it is being read. This is most useful for
+editing SQL statements so that slightly different statements in the raw data
+will be merged and aggregated in the loaded data. For example:
+
+  Filter => sub {
+      my ($path_ref, $data_ref) = @_;
+      s/foo = '.*?'/foo = '...'/ for @$path_ref;
+  }
+
+Here's an example that performs some normalization on the SQL. It converts all
+numbers to C<N> and all quoted strings to C<S>.  It can also convert digits to
+N within names. Finally, it summarizes long "IN (...)" clauses.
+
+It's aggressive and simplistic, but it's often sufficient, and serves as an
+example that you can tailor to suit your own needs:
+
+  Filter => sub {
+      my ($path_ref, $data_ref) = @_;
+      local $_ = $path_ref->[0]; # whichever element contains the SQL Statement
+      s/\b\d+\b/N/g;             # 42 -> N
+      s/\b0x[0-9A-Fa-f]+\b/N/g;  # 0xFE -> N
+      s/'.*?'/'S'/g;             # single quoted strings (doesn't handle escapes)
+      s/".*?"/"S"/g;             # double quoted strings (doesn't handle escapes)
+      # convert names like log_20001231 into log_NNNNNNNN, controlled by $opt{n}
+      s/([a-z_]+)(\d{$opt{n},})/$1.('N' x length($2))/ieg if $opt{n};
+      # abbreviate massive "in (...)" statements and similar
+      s!(([NS],){100,})!sprintf("$2,{repeated %d times}",length($1)/2)!eg;
+  }
+
+It's often better to perform this kinds of normalization in the DBI while the
+data is being collected, to avoid too much memory being used by storing profile
+data for many different SQL statement. See L<DBI::Profile>.
+
+=cut
+
+sub new {
+    my $pkg = shift;
+    my $self = {                
+                Files        => [ "dbi.prof" ],
+		Filter       => undef,
+                DeleteFiles  => 0,
+                LockFile     => $HAS_FLOCK,
+                _header      => {},
+                _nodes       => [],
+                _node_lookup => {},
+                _sort        => 'none',
+                @_
+               };
+    bless $self, $pkg;
+    
+    # File (singular) overrides Files (plural)
+    $self->{Files} = [ $self->{File} ] if exists $self->{File};
+
+    $self->_read_files();
+    return $self;
+}
+
+# read files into _header and _nodes
+sub _read_files {
+    my $self = shift;
+    my $files  = $self->{Files};
+    my $read_header = 0;
+    my @files_to_delete;
+  
+    my $fh = gensym;
+    foreach (@$files) {
+        my $filename = $_;
+
+        if ($self->{DeleteFiles}) {
+            my $newfilename = $filename . ".deleteme";
+	    if ($^O eq 'VMS') {
+		# VMS default filesystem can only have one period
+		$newfilename = $filename . 'deleteme';
+	    }
+            # will clobber an existing $newfilename
+            rename($filename, $newfilename)
+                or croak "Can't rename($filename, $newfilename): $!";
+	    # On a versioned filesystem we want old versions to be removed
+	    1 while (unlink $filename);
+            $filename = $newfilename;
+        }
+
+        open($fh, "<", $filename)
+          or croak("Unable to read profile file '$filename': $!");
+
+        # lock the file in case it's still being written to
+        # (we'll be foced to wait till the write is complete)
+        flock($fh, LOCK_SH) if $self->{LockFile};
+
+        if (-s $fh) {   # not empty
+            $self->_read_header($fh, $filename, $read_header ? 0 : 1);
+            $read_header = 1;
+            $self->_read_body($fh, $filename);
+        }
+        close($fh); # and release lock
+        
+        push @files_to_delete, $filename
+            if $self->{DeleteFiles};
+    }
+    for (@files_to_delete){
+	# for versioned file systems
+	1 while (unlink $_);
+	if(-e $_){
+	    warn "Can't delete '$_': $!";
+	}
+    }
+    
+    # discard node_lookup now that all files are read
+    delete $self->{_node_lookup};
+}
+
+# read the header from the given $fh named $filename.  Discards the
+# data unless $keep.
+sub _read_header {
+    my ($self, $fh, $filename, $keep) = @_;
+
+    # get profiler module id
+    my $first = <$fh>;
+    chomp $first;
+    $self->{_profiler} = $first if $keep;
+
+    # collect variables from the header
+    local $_;
+    while (<$fh>) {
+        chomp;
+        last unless length $_;
+        /^(\S+)\s*=\s*(.*)/
+          or croak("Syntax error in header in $filename line $.: $_");
+        # XXX should compare new with existing (from previous file)
+        # and warn if they differ (diferent program or path)
+        $self->{_header}{$1} = unescape_key($2) if $keep;
+    }
+}
+
+
+sub unescape_key {  # inverse of escape_key() in DBI::ProfileDumper
+    local $_ = shift;
+    s/(?<!\\)\\n/\n/g; # expand \n, unless it's a \\n
+    s/(?<!\\)\\r/\r/g; # expand \r, unless it's a \\r
+    s/\\\\/\\/g;       # \\ to \
+    return $_;
+}
+
+
+# reads the body of the profile data
+sub _read_body {
+    my ($self, $fh, $filename) = @_;
+    my $nodes = $self->{_nodes};
+    my $lookup = $self->{_node_lookup};
+    my $filter = $self->{Filter};
+
+    # build up node array
+    my @path = ("");
+    my (@data, $path_key);
+    local $_;
+    while (<$fh>) {
+        chomp;
+        if (/^\+\s+(\d+)\s?(.*)/) {
+            # it's a key
+            my ($key, $index) = ($2, $1 - 1);
+
+            $#path = $index;      # truncate path to new length
+            $path[$index] = unescape_key($key); # place new key at end
+
+        }
+	elsif (s/^=\s+//) {
+            # it's data - file in the node array with the path in index 0
+	    # (the optional minus is to make it more robust against systems
+	    # with unstable high-res clocks - typically due to poor NTP config
+	    # of kernel SMP behaviour, i.e. min time may be -0.000008))
+
+            @data = split / /, $_;
+
+            # corrupt data?
+            croak("Invalid number of fields in $filename line $.: $_")
+                unless @data == 7;
+            croak("Invalid leaf node characters $filename line $.: $_")
+                unless m/^[-+ 0-9eE\.]+$/;
+
+	    # hook to enable pre-processing of the data - such as mangling SQL
+	    # so that slightly different statements get treated as the same
+	    # and so merged in the results
+	    $filter->(\@path, \@data) if $filter;
+
+            # elements of @path can't have NULLs in them, so this
+            # forms a unique string per @path.  If there's some way I
+            # can get this without arbitrarily stripping out a
+            # character I'd be happy to hear it!
+            $path_key = join("\0",@path);
+
+            # look for previous entry
+            if (exists $lookup->{$path_key}) {
+                # merge in the new data
+		dbi_profile_merge($nodes->[$lookup->{$path_key}], \@data);
+            } else {
+                # insert a new node - nodes are arrays with data in 0-6
+                # and path data after that
+                push(@$nodes, [ @data, @path ]);
+
+                # record node in %seen
+                $lookup->{$path_key} = $#$nodes;
+            }
+        }
+	else {
+            croak("Invalid line type syntax error in $filename line $.: $_");
+	}
+    }
+}
+
+
+
+=head2 $copy = $prof->clone();
+
+Clone a profile data set creating a new object.
+
+=cut
+
+sub clone {
+    my $self = shift;
+
+    # start with a simple copy
+    my $clone = bless { %$self }, ref($self);
+
+    # deep copy nodes
+    $clone->{_nodes}  = [ map { [ @$_ ] } @{$self->{_nodes}} ];
+
+    # deep copy header
+    $clone->{_header} = { %{$self->{_header}} };
+
+    return $clone;
+}
+
+=head2 $header = $prof->header();
+
+Returns a reference to a hash of header values.  These are the key
+value pairs included in the header section of the DBI::ProfileDumper
+data format.  For example:
+
+  $header = {
+    Path    => [ '!Statement', '!MethodName' ],
+    Program => 't/42profile_data.t',
+  };
+
+Note that modifying this hash will modify the header data stored
+inside the profile object.
+
+=cut
+
+sub header { shift->{_header} }
+
+
+=head2 $nodes = $prof->nodes()
+
+Returns a reference the sorted nodes array.  Each element in the array
+is a single record in the data set.  The first seven elements are the
+same as the elements provided by DBI::Profile.  After that each key is
+in a separate element.  For example:
+
+ $nodes = [
+            [
+              2,                      # 0, count
+              0.0312958955764771,     # 1, total duration
+              0.000490069389343262,   # 2, first duration
+              0.000176072120666504,   # 3, shortest duration
+              0.00140702724456787,    # 4, longest duration
+              1023115819.83019,       # 5, time of first event
+              1023115819.86576,       # 6, time of last event
+              'SELECT foo FROM bar'   # 7, key1
+              'execute'               # 8, key2
+                                      # 6+N, keyN
+            ],
+                                      # ...
+          ];
+
+Note that modifying this array will modify the node data stored inside
+the profile object.
+
+=cut
+
+sub nodes { shift->{_nodes} }
+
+
+=head2 $count = $prof->count()
+
+Returns the number of items in the profile data set.
+
+=cut
+
+sub count { scalar @{shift->{_nodes}} }
+
+
+=head2 $prof->sort(field => "field")
+
+=head2 $prof->sort(field => "field", reverse => 1)
+
+Sorts data by the given field.  Available fields are:
+
+  longest
+  total
+  count
+  shortest
+
+The default sort is greatest to smallest, which is the opposite of the
+normal Perl meaning.  This, however, matches the expected behavior of
+the dbiprof frontend.
+
+=cut
+
+
+# sorts data by one of the available fields
+{
+    my %FIELDS = (
+                  longest  => LONGEST,
+                  total    => TOTAL,
+                  count    => COUNT,
+                  shortest => SHORTEST,
+                  key1     => PATH+0,
+                  key2     => PATH+1,
+                  key3     => PATH+2,
+                 );
+    sub sort {
+        my $self = shift;
+        my $nodes = $self->{_nodes};
+        my %opt = @_;
+        
+        croak("Missing required field option.") unless $opt{field};
+
+        my $index = $FIELDS{$opt{field}};
+        
+        croak("Unrecognized sort field '$opt{field}'.")
+          unless defined $index;
+
+        # sort over index
+        if ($opt{reverse}) {
+            @$nodes = sort { 
+                $a->[$index] <=> $b->[$index] 
+            } @$nodes;
+        } else {
+            @$nodes = sort { 
+                $b->[$index] <=> $a->[$index] 
+            } @$nodes;
+        }
+
+        # remember how we're sorted
+        $self->{_sort} = $opt{field};
+
+        return $self;
+    }
+}
+
+
+=head2 $count = $prof->exclude(key2 => "disconnect")
+
+=head2 $count = $prof->exclude(key2 => "disconnect", case_sensitive => 1)
+
+=head2 $count = $prof->exclude(key1 => qr/^SELECT/i)
+
+Removes records from the data set that match the given string or
+regular expression.  This method modifies the data in a permanent
+fashion - use clone() first to maintain the original data after
+exclude().  Returns the number of nodes left in the profile data set.
+
+=cut
+
+sub exclude {
+    my $self = shift;
+    my $nodes = $self->{_nodes};
+    my %opt = @_;
+
+    # find key index number
+    my ($index, $val);
+    foreach (keys %opt) {
+        if (/^key(\d+)$/) {
+            $index   = PATH + $1 - 1;
+            $val     = $opt{$_};
+            last;
+        }
+    }
+    croak("Missing required keyN option.") unless $index;
+
+    if (UNIVERSAL::isa($val,"Regexp")) {
+        # regex match
+        @$nodes = grep {
+            $#$_ < $index or $_->[$index] !~ /$val/ 
+        } @$nodes;
+    } else {
+        if ($opt{case_sensitive}) {
+            @$nodes = grep { 
+                $#$_ < $index or $_->[$index] ne $val;
+            } @$nodes;
+        } else {
+            $val = lc $val;
+            @$nodes = grep { 
+                $#$_ < $index or lc($_->[$index]) ne $val;
+            } @$nodes;
+        }
+    }
+
+    return scalar @$nodes;
+}
+
+
+=head2 $count = $prof->match(key2 => "disconnect")
+
+=head2 $count = $prof->match(key2 => "disconnect", case_sensitive => 1)
+
+=head2 $count = $prof->match(key1 => qr/^SELECT/i)
+
+Removes records from the data set that do not match the given string
+or regular expression.  This method modifies the data in a permanent
+fashion - use clone() first to maintain the original data after
+match().  Returns the number of nodes left in the profile data set.
+
+=cut
+
+sub match {
+    my $self = shift;
+    my $nodes = $self->{_nodes};
+    my %opt = @_;
+
+    # find key index number
+    my ($index, $val);
+    foreach (keys %opt) {
+        if (/^key(\d+)$/) {
+            $index   = PATH + $1 - 1;
+            $val     = $opt{$_};
+            last;
+        }
+    }
+    croak("Missing required keyN option.") unless $index;
+
+    if (UNIVERSAL::isa($val,"Regexp")) {
+        # regex match
+        @$nodes = grep {
+            $#$_ >= $index and $_->[$index] =~ /$val/ 
+        } @$nodes;
+    } else {
+        if ($opt{case_sensitive}) {
+            @$nodes = grep { 
+                $#$_ >= $index and $_->[$index] eq $val;
+            } @$nodes;
+        } else {
+            $val = lc $val;
+            @$nodes = grep { 
+                $#$_ >= $index and lc($_->[$index]) eq $val;
+            } @$nodes;
+        }
+    }
+
+    return scalar @$nodes;
+}
+
+
+=head2 $Data = $prof->Data()
+
+Returns the same Data hash structure as seen in DBI::Profile.  This
+structure is not sorted.  The nodes() structure probably makes more
+sense for most analysis.
+
+=cut
+
+sub Data {
+    my $self = shift;
+    my (%Data, @data, $ptr);
+
+    foreach my $node (@{$self->{_nodes}}) {
+        # traverse to key location
+        $ptr = \%Data;
+        foreach my $key (@{$node}[PATH .. $#$node - 1]) {
+            $ptr->{$key} = {} unless exists $ptr->{$key};
+            $ptr = $ptr->{$key};
+        }
+
+        # slice out node data
+        $ptr->{$node->[-1]} = [ @{$node}[0 .. 6] ];
+    }
+
+    return \%Data;
+}
+
+
+=head2 $text = $prof->format($nodes->[0])
+
+Formats a single node into a human-readable block of text.
+
+=cut
+
+sub format {
+    my ($self, $node) = @_;
+    my $format;
+    
+    # setup keys
+    my $keys = "";
+    for (my $i = PATH; $i <= $#$node; $i++) {
+        my $key = $node->[$i];
+        
+        # remove leading and trailing space
+        $key =~ s/^\s+//;
+        $key =~ s/\s+$//;
+
+        # if key has newlines or is long take special precautions
+        if (length($key) > 72 or $key =~ /\n/) {
+            $keys .= "  Key " . ($i - PATH + 1) . "         :\n\n$key\n\n";
+        } else {
+            $keys .= "  Key " . ($i - PATH + 1) . "         : $key\n";
+        }
+    }
+
+    # nodes with multiple runs get the long entry format, nodes with
+    # just one run get a single count.
+    if ($node->[COUNT] > 1) {
+        $format = <<END;
+  Count         : %d
+  Total Time    : %3.6f seconds
+  Longest Time  : %3.6f seconds
+  Shortest Time : %3.6f seconds
+  Average Time  : %3.6f seconds
+END
+        return sprintf($format, @{$node}[COUNT,TOTAL,LONGEST,SHORTEST], 
+                       $node->[TOTAL] / $node->[COUNT]) . $keys;
+    } else {
+        $format = <<END;
+  Count         : %d
+  Time          : %3.6f seconds
+END
+
+        return sprintf($format, @{$node}[COUNT,TOTAL]) . $keys;
+
+    }
+}
+
+
+=head2 $text = $prof->report(number => 10)
+
+Produces a report with the given number of items.
+
+=cut
+
+sub report {
+    my $self  = shift;
+    my $nodes = $self->{_nodes};
+    my %opt   = @_;
+
+    croak("Missing required number option") unless exists $opt{number};
+
+    $opt{number} = @$nodes if @$nodes < $opt{number};
+
+    my $report = $self->_report_header($opt{number});
+    for (0 .. $opt{number} - 1) {
+        $report .= sprintf("#" x 5  . "[ %d ]". "#" x 59 . "\n", 
+                           $_ + 1);
+        $report .= $self->format($nodes->[$_]);
+        $report .= "\n";
+    }
+    return $report;
+}
+
+# format the header for report()
+sub _report_header {
+    my ($self, $number) = @_;
+    my $nodes = $self->{_nodes};
+    my $node_count = @$nodes;
+
+    # find total runtime and method count
+    my ($time, $count) = (0,0);
+    foreach my $node (@$nodes) {
+        $time  += $node->[TOTAL];
+        $count += $node->[COUNT];
+    }
+
+    my $header = <<END;
+
+DBI Profile Data ($self->{_profiler})
+
+END
+
+    # output header fields
+    while (my ($key, $value) = each %{$self->{_header}}) {
+        $header .= sprintf("  %-13s : %s\n", $key, $value);
+    }
+
+    # output summary data fields
+    $header .= sprintf(<<END, $node_count, $number, $self->{_sort}, $count, $time);
+  Total Records : %d (showing %d, sorted by %s)
+  Total Count   : %d
+  Total Runtime : %3.6f seconds  
+
+END
+
+    return $header;
+}
+
+
+1;
+
+__END__
+
+=head1 AUTHOR
+
+Sam Tregar <sam@tregar.com>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2002 Sam Tregar
+
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl 5 itself.
+
+=cut